From 2a167b2eeb993638c198db49f3927bae5d55508b Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Mon, 25 Aug 2025 09:25:52 -0700 Subject: [PATCH 001/112] [test][RL] Add sleep level 2 test and fix reload with sleep mode (#23521) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- tests/basic_correctness/test_cumem.py | 31 +++++++++++++++++++++++++++ vllm/v1/worker/gpu_worker.py | 3 +-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index 34f9389c82a9b..f3ad680b72b55 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -177,3 +177,34 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): # cmp output assert output[0].outputs[0].text == output3[0].outputs[0].text + + +@create_new_process_for_each_test() +def test_deep_sleep(): + model = "Qwen/Qwen3-0.6B" + free, total = torch.cuda.mem_get_info() + used_bytes_baseline = total - free # in case other process is running + llm = LLM(model, enable_sleep_mode=True) + prompt = "How are you?" 
+ sampling_params = SamplingParams(temperature=0, max_tokens=10) + output = llm.generate(prompt, sampling_params) + + # Put the engine to deep sleep + llm.sleep(level=2) + + free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline + assert used_bytes < 3 * GiB_bytes + + llm.wake_up(tags=["weights"]) + llm.collective_rpc("reload_weights") + free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline + assert used_bytes < 4 * GiB_bytes + + # now allocate kv cache and cuda graph memory + llm.wake_up(tags=["kv_cache"]) + output2 = llm.generate(prompt, sampling_params) + + # cmp output + assert output[0].outputs[0].text == output2[0].outputs[0].text diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index f83a4f4faeb5e..1688b8b83e873 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -216,8 +216,7 @@ class Worker(WorkerBase): self.model_runner.update_config(overrides) def reload_weights(self) -> None: - with self._maybe_get_memory_pool_context(tag="weights"): - self.model_runner.reload_weights() + self.model_runner.reload_weights() @torch.inference_mode() def determine_available_memory(self) -> int: From 8a3cd90af534c39425ebfdfd295eea0a4582d541 Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Mon, 25 Aug 2025 11:47:52 -0700 Subject: [PATCH 002/112] [Kernel] Add fused grouped_topk kernel for MoE (#23274) Signed-off-by: Xin Yang Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- CMakeLists.txt | 4 +- csrc/moe/grouped_topk_kernels.cu | 757 ++++++++++++++++++ csrc/moe/moe_ops.h | 5 + csrc/moe/torch_bindings.cpp | 6 + tests/kernels/moe/test_grouped_topk.py | 76 ++ vllm/_custom_ops.py | 11 + vllm/envs.py | 6 + .../layers/fused_moe/fused_moe.py | 46 +- 8 files changed, 909 insertions(+), 2 deletions(-) create mode 
100644 csrc/moe/grouped_topk_kernels.cu create mode 100644 tests/kernels/moe/test_grouped_topk.py diff --git a/CMakeLists.txt b/CMakeLists.txt index aca42c3fe5553..b0ed4a284db95 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -817,7 +817,9 @@ set(VLLM_MOE_EXT_SRC "csrc/moe/topk_softmax_kernels.cu") if(VLLM_GPU_LANG STREQUAL "CUDA") - list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu") + list(APPEND VLLM_MOE_EXT_SRC + "csrc/moe/moe_wna16.cu" + "csrc/moe/grouped_topk_kernels.cu") endif() if(VLLM_GPU_LANG STREQUAL "CUDA") diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu new file mode 100644 index 0000000000000..78f7b3cc1aa25 --- /dev/null +++ b/csrc/moe/grouped_topk_kernels.cu @@ -0,0 +1,757 @@ +/* + * Adapted from + * https://github.com/NVIDIA/TensorRT-LLM/blob/v0.21.0/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu + * Copyright (c) 2025, The vLLM team. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & + * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +namespace cg = cooperative_groups; + +namespace vllm { +namespace moe { + +constexpr unsigned FULL_WARP_MASK = 0xffffffff; +constexpr int32_t WARP_SIZE = 32; +constexpr int32_t BLOCK_SIZE = 512; +constexpr int32_t NUM_WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE; + +namespace warp_topk { + +template +__host__ __device__ constexpr T round_up_to_multiple_of(T len) { + if (len == 0) { + return 0; + } + return ((len - 1) / size + 1) * size; +} + +template +constexpr __host__ __device__ bool isPowerOf2(T v) { + return (v && !(v & (v - 1))); +} + +template +__forceinline__ __device__ bool is_better_than(T val, T baseline) { + return (val > baseline && greater) || (val < baseline && !greater); +} + +template +__forceinline__ __device__ bool is_better_than(T val, T baseline, idxT index, + idxT baseline_index) { + bool res = (val > baseline && greater) || (val < baseline && !greater); + if (val == baseline) { + res = (index < baseline_index && greater) || + (index < baseline_index && !greater); + } + return res; +} + +template +int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) { + int64_t cache_topk = (sizeof(T) + sizeof(idxT)) * num_of_warp * k; + int64_t n = std::max(num_of_warp / 2 * k, num_of_warp * WARP_SIZE); + return max(cache_topk, + round_up_to_multiple_of<256>(n * sizeof(T)) + n * sizeof(idxT)); +} + +template +struct BitonicMerge { + // input should be a bitonic sequence, and sort it to be a monotonic sequence + __device__ static void merge(T* __restrict__ val_arr, + idxT* __restrict__ idx_arr) { + static_assert(isPowerOf2(size)); + static_assert(size >= 2 * WARP_SIZE); + constexpr int arr_len = size / WARP_SIZE; + + constexpr int stride = arr_len / 2; + for (int i = 0; i < stride; ++i) { + int const other_i = i + stride; + T& val = val_arr[i]; + T& other_val = val_arr[other_i]; + bool is_better; + if constexpr (is_stable) { + is_better = is_better_than(val, other_val, idx_arr[i], + 
idx_arr[other_i]); + } else { + is_better = is_better_than(val, other_val); + } + + if (is_better) { + T tmp = val; + val = other_val; + other_val = tmp; + + idxT tmp2 = idx_arr[i]; + idx_arr[i] = idx_arr[other_i]; + idx_arr[other_i] = tmp2; + } + } + + BitonicMerge::merge( + val_arr, idx_arr); + BitonicMerge::merge( + val_arr + arr_len / 2, idx_arr + arr_len / 2); + } +}; + +template +struct BitonicSort { + __device__ static void sort(T* __restrict__ val_arr, + idxT* __restrict__ idx_arr) { + static_assert(isPowerOf2(size)); + static_assert(size >= 2 * WARP_SIZE); + constexpr int arr_len = size / WARP_SIZE; + + BitonicSort::sort(val_arr, idx_arr); + BitonicSort::sort( + val_arr + arr_len / 2, idx_arr + arr_len / 2); + BitonicMerge::merge( + val_arr, idx_arr); + } +}; + +template +struct BitonicSort<32, ascending, T, idxT, is_stable> { + __device__ static void sort(T* __restrict__ val_arr, + idxT* __restrict__ idx_arr) { + int const lane = threadIdx.x % WARP_SIZE; + + // ascending doesn't matter before merging since all we need is a bitonic + // sequence + for (int stage = 0; stage < 4; ++stage) { + for (int stride = (1 << stage); stride > 0; stride /= 2) { + bool reverse = (lane >> stage) & 2; + bool is_second = lane & stride; + + T other = __shfl_xor_sync(FULL_WARP_MASK, *val_arr, stride); + idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, *idx_arr, stride); + + bool is_better; + if constexpr (is_stable) { + if constexpr (ascending) { + is_better = ((*val_arr > other) || + ((*val_arr == other) && (*idx_arr < other_idx))) != + (reverse != is_second); + } else { + is_better = ((*val_arr > other) || + ((*val_arr == other) && (*idx_arr > other_idx))) != + (reverse != is_second); + } + } else { + is_better = (*val_arr != other && + (*val_arr > other) != (reverse != is_second)); + } + if (is_better) { + *val_arr = other; + *idx_arr = other_idx; + } + } + } + + BitonicMerge<32, ascending, ascending, T, idxT, is_stable>::merge(val_arr, + idx_arr); + } +}; + +template 
+struct BitonicMerge<32, ascending, reverse, T, idxT, is_stable> { + __device__ static void merge(T* __restrict__ val_arr, + idxT* __restrict__ idx_arr) { + int const lane = threadIdx.x % WARP_SIZE; + for (int stride = WARP_SIZE / 2; stride > 0; stride /= 2) { + bool is_second = lane & stride; + T& val = *val_arr; + T other = __shfl_xor_sync(FULL_WARP_MASK, val, stride); + idxT& idx = *idx_arr; + idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, idx, stride); + + bool is_better; + if constexpr (is_stable) { + if constexpr (ascending) { + is_better = ((*val_arr > other) || + ((*val_arr == other) && (*idx_arr < other_idx))) == + (reverse != is_second); // for min + } else { + is_better = ((*val_arr > other) || + ((*val_arr == other) && (*idx_arr > other_idx))) == + (reverse != is_second); // for max + } + } else { + is_better = + (val != other && ((val > other) == (ascending != is_second))); + } + + if (is_better) { + val = other; + idx = other_idx; + } + } + } +}; + +template +class WarpSort { + public: + __device__ WarpSort(idxT k, T dummy) + : lane_(threadIdx.x % WARP_SIZE), k_(k), dummy_(dummy) { + static_assert(capacity >= WARP_SIZE && isPowerOf2(capacity)); + + for (int i = 0; i < max_arr_len_; ++i) { + val_arr_[i] = dummy_; + idx_arr_[i] = 0; + } + } + + // load and merge k sorted values + __device__ void load_sorted(T const* __restrict__ in, + idxT const* __restrict__ in_idx, idxT start) { + idxT idx = start + WARP_SIZE - 1 - lane_; + for (int i = max_arr_len_ - 1; i >= 0; --i, idx += WARP_SIZE) { + if (idx < start + k_) { + T t = in[idx]; + bool is_better; + if constexpr (is_stable) { + is_better = + is_better_than(t, val_arr_[i], in_idx[idx], idx_arr_[i]); + } else { + is_better = is_better_than(t, val_arr_[i]); + } + if (is_better) { + val_arr_[i] = t; + idx_arr_[i] = in_idx[idx]; + } + } + } + + BitonicMerge::merge( + val_arr_, idx_arr_); + } + + __device__ void dump(T* __restrict__ out, idxT* __restrict__ out_idx) const { + for (int i = 0; i < 
max_arr_len_; ++i) { + idxT out_i = i * WARP_SIZE + lane_; + if (out_i < k_) { + out[out_i] = val_arr_[i]; + out_idx[out_i] = idx_arr_[i]; + } + } + } + + __device__ void dumpIdx(idxT* __restrict__ out_idx) const { + for (int i = 0; i < max_arr_len_; ++i) { + idxT out_i = i * WARP_SIZE + lane_; + if (out_i < k_) { + out_idx[out_i] = idx_arr_[i]; + } + } + } + + protected: + static constexpr int max_arr_len_ = capacity / WARP_SIZE; + + T val_arr_[max_arr_len_]; + idxT idx_arr_[max_arr_len_]; + + int const lane_; + idxT const k_; + T const dummy_; + +}; // end class WarpSort + +template +class WarpSelect : public WarpSort { + public: + __device__ WarpSelect(idxT k, T dummy) + : WarpSort(k, dummy), + k_th_(dummy), + k_th_lane_((k - 1) % WARP_SIZE) { + extern __shared__ char smem_buf[]; // extern __shared__ T smem_buf[]; + + int const num_of_warp = blockDim.x / WARP_SIZE; + int const warp_id = threadIdx.x / WARP_SIZE; + val_smem_ = reinterpret_cast(smem_buf); + val_smem_ += warp_id * WARP_SIZE; + idx_smem_ = reinterpret_cast( + smem_buf + + round_up_to_multiple_of<256>(num_of_warp * sizeof(T) * WARP_SIZE)); + idx_smem_ += warp_id * WARP_SIZE; + } + + __device__ void add(T const* in, idxT start, idxT end) { + idxT const end_for_fullwarp = + round_up_to_multiple_of(end - start) + start; + for (idxT i = start + lane_; i < end_for_fullwarp; i += WARP_SIZE) { + T val = (i < end) ? 
in[i] : dummy_; + add(val, i); + } + } + + __device__ void add(T val, idxT idx) { + bool do_add; + if constexpr (is_stable) { + do_add = is_better_than(val, k_th_, idx, k_th_idx_); + } else { + do_add = is_better_than(val, k_th_); + } + + uint32_t mask = __ballot_sync(FULL_WARP_MASK, do_add); + if (mask == 0) { + return; + } + + int pos = smem_buf_len_ + __popc(mask & ((0x1u << lane_) - 1)); + if (do_add && pos < WARP_SIZE) { + val_smem_[pos] = val; + idx_smem_[pos] = idx; + do_add = false; + } + smem_buf_len_ += __popc(mask); + if (smem_buf_len_ >= WARP_SIZE) { + __syncwarp(); + merge_buf_(val_smem_[lane_], idx_smem_[lane_]); + smem_buf_len_ -= WARP_SIZE; + } + if (do_add) { + pos -= WARP_SIZE; + val_smem_[pos] = val; + idx_smem_[pos] = idx; + } + __syncwarp(); + } + + __device__ void done() { + if (smem_buf_len_) { + T val = (lane_ < smem_buf_len_) ? val_smem_[lane_] : dummy_; + idxT idx = (lane_ < smem_buf_len_) ? idx_smem_[lane_] : 0; + merge_buf_(val, idx); + } + + // after done(), smem is used for merging results among warps + __syncthreads(); + } + + private: + __device__ void set_k_th_() { + k_th_ = __shfl_sync(FULL_WARP_MASK, val_arr_[max_arr_len_ - 1], k_th_lane_); + if constexpr (is_stable) { + k_th_idx_ = + __shfl_sync(FULL_WARP_MASK, idx_arr_[max_arr_len_ - 1], k_th_lane_); + } + } + + __device__ void merge_buf_(T val, idxT idx) { + BitonicSort::sort(&val, &idx); + + T& old = val_arr_[max_arr_len_ - 1]; + + bool is_better; + if constexpr (is_stable) { + is_better = + is_better_than(val, old, idx, idx_arr_[max_arr_len_ - 1]); + } else { + is_better = is_better_than(val, old); + } + + if (is_better) { + old = val; + idx_arr_[max_arr_len_ - 1] = idx; + } + + BitonicMerge::merge( + val_arr_, idx_arr_); + + set_k_th_(); + } + + using WarpSort::max_arr_len_; + using WarpSort::val_arr_; + using WarpSort::idx_arr_; + using WarpSort::lane_; + using WarpSort::k_; + using WarpSort::dummy_; + + T* val_smem_; + idxT* idx_smem_; + int smem_buf_len_ = 0; + + T k_th_; 
+ idxT k_th_idx_; + int const k_th_lane_; +}; // end class WarpSelect +} // namespace warp_topk + +template +__device__ inline T_OUT cuda_cast(T_IN val) { + return val; +} + +template <> +__device__ inline float cuda_cast(__nv_bfloat16 val) { + return __bfloat162float(val); +} + +template +__device__ void topk_with_k2(T* output, T const* input, + cg::thread_block_tile<32> const& tile, + int32_t const lane_id, + int const num_experts_per_group) { + // Get the top2 per thread + T largest = -INFINITY; + T second_largest = -INFINITY; + + if (num_experts_per_group > WARP_SIZE) { + for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) { + T value = input[i]; + if (value > largest) { + second_largest = largest; + largest = value; + } else if (value > second_largest) { + second_largest = value; + } + } + } else { + for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) { + largest = input[i]; + } + } + + __syncwarp(); // Ensure all threads have valid data before reduction + // Get the top2 warpwise + T max1 = cg::reduce(tile, largest, cg::greater()); + + T max2 = max1; + bool equal_to_max1 = (max1 == largest); + + int count_max1 = __popc(__ballot_sync(FULL_WARP_MASK, equal_to_max1)); + + if (count_max1 == 1) { + largest = (largest == max1) ? 
second_largest : largest; + max2 = cg::reduce(tile, largest, cg::greater()); + } + + if (lane_id == 0) { + *output = max1 + max2; + } +} + +template +__global__ void topk_with_k2_kernel(T* output, T* input, + int64_t const num_tokens, + int64_t const num_cases, + int64_t const n_group, + int64_t const num_experts_per_group) { + int32_t warp_id = threadIdx.x / WARP_SIZE; + int32_t lane_id = threadIdx.x % WARP_SIZE; + + int32_t case_id = blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id; + if (case_id < num_cases) { + input += case_id * num_experts_per_group; + output += case_id; + + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block); + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + asm volatile("griddepcontrol.wait;"); +#endif + topk_with_k2(output, input, tile, lane_id, num_experts_per_group); + } +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + asm volatile("griddepcontrol.launch_dependents;"); +#endif +} + +template +__global__ void group_idx_and_topk_idx_kernel( + T* scores, T const* group_scores, T* topk_values, IdxT* topk_indices, + T* scores_with_bias, int64_t const num_tokens, int64_t const n_group, + int64_t const topk_group, int64_t const topk, int64_t const num_experts, + int64_t const num_experts_per_group, bool renormalize, + double routed_scaling_factor) { + int32_t warp_id = threadIdx.x / WARP_SIZE; + int32_t lane_id = threadIdx.x % WARP_SIZE; + int32_t case_id = + blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id; // one per token + scores_with_bias += case_id * num_experts; + scores += case_id * num_experts; + group_scores += case_id * n_group; + topk_values += case_id * topk; + topk_indices += case_id * topk; + + int32_t align_num_experts_per_group = + warp_topk::round_up_to_multiple_of(num_experts_per_group); + + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block); + + extern __shared__ char smem_buf[]; // NOTE: 
reuse the shared memory here to + // store the target topk idx + int32_t* s_topk_idx = reinterpret_cast(smem_buf); + T* s_topk_value = + reinterpret_cast(s_topk_idx + NUM_WARPS_PER_BLOCK * topk) + + warp_id * topk; + s_topk_idx += warp_id * topk; + + T value = cuda::std::numeric_limits::min(); + T topk_group_value = cuda::std::numeric_limits::min(); + int32_t num_equalto_topkth_group; + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + asm volatile("griddepcontrol.wait;"); // I think all prolog can be put before + // acqbulk because it's ptr arithmetic +#endif + + if (case_id < num_tokens) { + // calculate group_idx + int32_t target_num_min = WARP_SIZE - n_group + topk_group; + if (lane_id < n_group && + (isfinite(cuda_cast( + group_scores[lane_id])))) // The check is necessary to avoid + // abnormal input + { + value = group_scores[lane_id]; + } + + int count_equal_to_top_value = WARP_SIZE - n_group; + int pre_count_equal_to_top_value = 0; + // Use loop to find the largset top_group + while (count_equal_to_top_value < target_num_min) { + __syncwarp(); // Ensure all threads have valid data before reduction + topk_group_value = cg::reduce(tile, value, cg::greater()); + if (value == topk_group_value) { + value = cuda::std::numeric_limits::min(); + } + pre_count_equal_to_top_value = count_equal_to_top_value; + count_equal_to_top_value = __popc(__ballot_sync( + FULL_WARP_MASK, (value == cuda::std::numeric_limits::min()))); + } + num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value; + } + __syncthreads(); + + warp_topk::WarpSelect + queue((int32_t)topk, -INFINITY); + + int count_equalto_topkth_group = 0; + bool if_proceed_next_topk = + (topk_group_value != cuda::std::numeric_limits::min()); + if (case_id < num_tokens && if_proceed_next_topk) { + for (int i_group = 0; i_group < n_group; i_group++) { + if ((group_scores[i_group] > topk_group_value) || + ((group_scores[i_group] == topk_group_value) && + (count_equalto_topkth_group < 
num_equalto_topkth_group))) { + int32_t offset = i_group * num_experts_per_group; + for (int32_t i = lane_id; i < align_num_experts_per_group; + i += WARP_SIZE) { + T candidates = + (i < num_experts_per_group) && isfinite(cuda_cast( + scores_with_bias[offset + i])) + ? scores_with_bias[offset + i] + : cuda::std::numeric_limits::min(); + queue.add(candidates, offset + i); + } + if (group_scores[i_group] == topk_group_value) { + count_equalto_topkth_group++; + } + } + } + queue.done(); + __syncwarp(); + // Get the topk_idx + queue.dumpIdx(s_topk_idx); + __syncwarp(); + } + + // Load the valid score value + // Calculate the summation + float topk_sum = 1e-20; + if (case_id < num_tokens && if_proceed_next_topk) { + for (int i = lane_id; + i < warp_topk::round_up_to_multiple_of(topk); + i += WARP_SIZE) { + T value = + i < topk + ? scores[s_topk_idx[i]] + : cuda_cast(0.0f); // Load the valid value of expert + if (i < topk) { + s_topk_value[i] = value; + } + topk_sum += reduce(tile, cuda_cast(value), cg::plus()); + } + } + + __syncthreads(); + + if (case_id < num_tokens) { + if (if_proceed_next_topk) { + for (int i = lane_id; i < topk; i += WARP_SIZE) { + float value; + if (renormalize) { + value = cuda_cast(s_topk_value[i]) / topk_sum * + routed_scaling_factor; + } else { + value = cuda_cast(s_topk_value[i]) * routed_scaling_factor; + } + topk_indices[i] = s_topk_idx[i]; + topk_values[i] = cuda_cast(value); + } + } else { + for (int i = lane_id; i < topk; i += WARP_SIZE) { + topk_indices[i] = i; + topk_values[i] = cuda_cast(1.0f / topk); + } + } + // Note: when if_proceed_next_topk==false, choose the first 8 experts as the + // default result. 
+ } +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + asm volatile("griddepcontrol.launch_dependents;"); +#endif +} + +template +void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values, + IdxT* topk_indices, T* scores_with_bias, + int64_t const num_tokens, int64_t const num_experts, + int64_t const n_group, int64_t const topk_group, + int64_t const topk, bool const renormalize, + double const routed_scaling_factor, bool enable_pdl = false, + cudaStream_t const stream = 0) { + int64_t num_cases = num_tokens * n_group; + int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1; + auto* kernel_instance1 = &topk_with_k2_kernel; + cudaLaunchConfig_t config; + config.gridDim = topk_with_k2_num_blocks; + config.blockDim = BLOCK_SIZE; + config.dynamicSmemBytes = 0; + config.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl; + config.numAttrs = 1; + config.attrs = attrs; + cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores_with_bias, + num_tokens, num_cases, n_group, num_experts / n_group); + + int64_t topk_with_k_group_num_blocks = + (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1; + size_t dynamic_smem_in_bytes = + warp_topk::calc_smem_size_for_block_wide(NUM_WARPS_PER_BLOCK, + topk); + auto* kernel_instance2 = &group_idx_and_topk_idx_kernel; + config.gridDim = topk_with_k_group_num_blocks; + config.blockDim = BLOCK_SIZE; + config.dynamicSmemBytes = dynamic_smem_in_bytes; + config.stream = stream; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl; + config.numAttrs = 1; + config.attrs = attrs; + cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores, + topk_values, topk_indices, scores_with_bias, num_tokens, + n_group, topk_group, topk, num_experts, + num_experts / n_group, renormalize, 
routed_scaling_factor); +} + +#define INSTANTIATE_NOAUX_TC(T, IdxT) \ + template void invokeNoAuxTc( \ + T * scores, T * group_scores, T * topk_values, IdxT * topk_indices, \ + T * scores_with_bias, int64_t const num_tokens, \ + int64_t const num_experts, int64_t const n_group, \ + int64_t const topk_group, int64_t const topk, bool const renormalize, \ + double const routed_scaling_factor, bool enable_pdl, \ + cudaStream_t const stream); + +INSTANTIATE_NOAUX_TC(float, int32_t); +INSTANTIATE_NOAUX_TC(half, int32_t); +INSTANTIATE_NOAUX_TC(__nv_bfloat16, int32_t); +} // end namespace moe +} // namespace vllm + +std::tuple grouped_topk( + torch::Tensor const& scores, torch::Tensor const& scores_with_bias, + int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize, + double routed_scaling_factor) { + auto data_type = scores_with_bias.scalar_type(); + auto input_size = scores_with_bias.sizes(); + int64_t num_tokens = input_size[0]; + int64_t num_experts = input_size[1]; + TORCH_CHECK(input_size.size() == 2, "scores_with_bias must be a 2D Tensor"); + TORCH_CHECK(num_experts % n_group == 0, + "num_experts should be divisible by n_group"); + TORCH_CHECK(n_group <= 32, + "n_group should be smaller than or equal to 32 for now"); + TORCH_CHECK(topk <= 32, "topk should be smaller than or equal to 32 for now"); + + torch::Tensor group_scores = torch::empty( + {num_tokens, n_group}, torch::dtype(data_type).device(torch::kCUDA)); + torch::Tensor topk_values = torch::empty( + {num_tokens, topk}, torch::dtype(data_type).device(torch::kCUDA)); + torch::Tensor topk_indices = torch::empty( + {num_tokens, topk}, torch::dtype(torch::kInt32).device(torch::kCUDA)); + + auto stream = c10::cuda::getCurrentCUDAStream(scores_with_bias.get_device()); + + switch (data_type) { + case torch::kFloat16: + // Handle Float16 + vllm::moe::invokeNoAuxTc( + reinterpret_cast(scores.mutable_data_ptr()), + reinterpret_cast(group_scores.mutable_data_ptr()), + 
reinterpret_cast(topk_values.mutable_data_ptr()), + reinterpret_cast(topk_indices.mutable_data_ptr()), + reinterpret_cast(scores_with_bias.data_ptr()), num_tokens, + num_experts, n_group, topk_group, topk, renormalize, + routed_scaling_factor, false, stream); + break; + case torch::kFloat32: + // Handle Float32 + vllm::moe::invokeNoAuxTc( + reinterpret_cast(scores.mutable_data_ptr()), + reinterpret_cast(group_scores.mutable_data_ptr()), + reinterpret_cast(topk_values.mutable_data_ptr()), + reinterpret_cast(topk_indices.mutable_data_ptr()), + reinterpret_cast(scores_with_bias.data_ptr()), num_tokens, + num_experts, n_group, topk_group, topk, renormalize, + routed_scaling_factor, false, stream); + break; + case torch::kBFloat16: + // Handle BFloat16 + vllm::moe::invokeNoAuxTc<__nv_bfloat16, int32_t>( + reinterpret_cast<__nv_bfloat16*>(scores.mutable_data_ptr()), + reinterpret_cast<__nv_bfloat16*>(group_scores.mutable_data_ptr()), + reinterpret_cast<__nv_bfloat16*>(topk_values.mutable_data_ptr()), + reinterpret_cast(topk_indices.mutable_data_ptr()), + reinterpret_cast<__nv_bfloat16*>(scores_with_bias.data_ptr()), + num_tokens, num_experts, n_group, topk_group, topk, renormalize, + routed_scaling_factor, false, stream); + break; + default: + // Handle other data types + throw std::invalid_argument( + "Invalid dtype, only supports float16, float32, and bfloat16"); + break; + } + return {topk_values, topk_indices}; +} diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index 661730c96867e..92fc280b362b9 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -22,6 +22,11 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, torch::Tensor num_tokens_post_pad, int64_t top_k, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, int64_t BLOCK_SIZE_K, int64_t bit); + +std::tuple grouped_topk( + torch::Tensor const& scores, torch::Tensor const& scores_with_bias, + int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize, + double 
routed_scaling_factor); #endif bool moe_permute_unpermute_supported(); diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 7e49f68f62438..8f33d6cd666fa 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -78,6 +78,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "output_tensor) -> ()"); m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows); + // Apply grouped topk routing to select experts. + m.def( + "grouped_topk(Tensor scores, Tensor scores_with_bias, int n_group, int " + "topk_group, int topk, bool renormalize, float " + "routed_scaling_factor) -> (Tensor, Tensor)"); + m.impl("grouped_topk", torch::kCUDA, &grouped_topk); #endif } diff --git a/tests/kernels/moe/test_grouped_topk.py b/tests/kernels/moe/test_grouped_topk.py new file mode 100644 index 0000000000000..646e763194fd6 --- /dev/null +++ b/tests/kernels/moe/test_grouped_topk.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for the MoE grouped topk kernel + +Run `pytest tests/kernels/moe/test_grouped_topk.py`. 
+""" +import pytest +import torch + +from vllm.model_executor.layers.fused_moe.fused_moe import (fused_grouped_topk, + grouped_topk) +from vllm.platforms import current_platform + + +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="This test is skipped on non-CUDA platform.") +@pytest.mark.parametrize("n_token", [1, 33, 64]) +@pytest.mark.parametrize("n_hidden", [1024, 2048]) +@pytest.mark.parametrize("n_expert", [16]) +@pytest.mark.parametrize("topk", [2]) +@pytest.mark.parametrize("renormalize", [True, False]) +@pytest.mark.parametrize("num_expert_group", [8]) +@pytest.mark.parametrize("topk_group", [2]) +@pytest.mark.parametrize("scoring_func", ["softmax", "sigmoid"]) +@pytest.mark.parametrize("routed_scaling_factor", [1.0, 2.5]) +@pytest.mark.parametrize("dtype", + [torch.float16, torch.bfloat16, torch.float32]) +def test_grouped_topk(monkeypatch: pytest.MonkeyPatch, n_token: int, + n_hidden: int, n_expert: int, topk: int, + renormalize: bool, num_expert_group: int, + topk_group: int, scoring_func: str, + routed_scaling_factor: float, dtype: torch.dtype): + current_platform.seed_everything(0) + hidden_states = torch.randn((n_token, n_hidden), + dtype=dtype, + device="cuda") + gating_output = torch.randn((n_token, n_expert), + dtype=dtype, + device="cuda") + e_score_correction_bias = torch.randn((n_expert, ), + dtype=torch.float32, + device="cuda") + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "0") + baseline_topk_weights, baseline_topk_ids = grouped_topk( + hidden_states=hidden_states, + gating_output=gating_output, + topk=topk, + renormalize=renormalize, + num_expert_group=num_expert_group, + topk_group=topk_group, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias) + + test_topk_weights, test_topk_ids = fused_grouped_topk( + hidden_states=hidden_states, + gating_output=gating_output, + topk=topk, + renormalize=renormalize, + 
num_expert_group=num_expert_group, + topk_group=topk_group, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias) + + if renormalize: + torch.testing.assert_close(baseline_topk_weights, + test_topk_weights, + atol=2e-2, + rtol=0) + torch.testing.assert_close(baseline_topk_ids, + test_topk_ids, + atol=0, + rtol=0) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 3e3b43ce2abe3..054dc9d985a4c 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1502,6 +1502,17 @@ def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor, gating_output) +def grouped_topk(scores: torch.Tensor, scores_with_bias: torch.Tensor, + num_expert_group: int, topk_group: int, topk: int, + renormalize: bool, routed_scaling_factor: float): + if not current_platform.is_cuda(): + raise NotImplementedError("The fused grouped_topk kernel is only " + "available on CUDA platforms") + return torch.ops._moe_C.grouped_topk(scores, scores_with_bias, + num_expert_group, topk_group, topk, + renormalize, routed_scaling_factor) + + def moe_wna16_marlin_gemm(input: torch.Tensor, output: Optional[torch.Tensor], b_qweight: torch.Tensor, b_bias: Optional[torch.Tensor], diff --git a/vllm/envs.py b/vllm/envs.py index 5d0e972f43ad0..1c9c4cdde8001 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -131,6 +131,7 @@ if TYPE_CHECKING: VLLM_USE_DEEP_GEMM: bool = False VLLM_USE_DEEP_GEMM_E8M0: bool = True VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False + VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True VLLM_USE_FLASHINFER_MOE_FP8: bool = False VLLM_USE_FLASHINFER_MOE_FP4: bool = False VLLM_FLASHINFER_MOE_BACKEND: str = "throughput" @@ -963,6 +964,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_SKIP_DEEP_GEMM_WARMUP": lambda: bool(int(os.getenv("VLLM_SKIP_DEEP_GEMM_WARMUP", "0"))), + # Whether to use fused grouped_topk used for MoE expert selection. 
+ "VLLM_USE_FUSED_MOE_GROUPED_TOPK": + lambda: bool(int(os.getenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "1"))), + # Allow use of FlashInfer MoE kernels for fused moe ops. "VLLM_USE_FLASHINFER_MOE_FP8": lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))), @@ -1229,6 +1234,7 @@ def compute_hash() -> str: "VLLM_DISABLED_KERNELS", "VLLM_USE_DEEP_GEMM", "VLLM_USE_TRTLLM_FP4_GEMM", + "VLLM_USE_FUSED_MOE_GROUPED_TOPK", "VLLM_USE_FLASHINFER_MOE_FP8", "VLLM_USE_FLASHINFER_MOE_FP4", "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 02b7b65f4a025..84dafcf00d821 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -949,8 +949,23 @@ def grouped_topk( num_expert_group: int = 0, topk_group: int = 0, scoring_func: str = "softmax", - e_score_correction_bias: Optional[torch.Tensor] = None + routed_scaling_factor: float = 1.0, + e_score_correction_bias: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, torch.Tensor]: + if envs.VLLM_USE_FUSED_MOE_GROUPED_TOPK and \ + current_platform.is_cuda() and \ + num_expert_group <= 32 and topk <= 32 and \ + e_score_correction_bias is not None: + return fused_grouped_topk( + hidden_states=hidden_states, + gating_output=gating_output, + topk=topk, + renormalize=renormalize, + e_score_correction_bias=e_score_correction_bias, + num_expert_group=num_expert_group, + topk_group=topk_group, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor) assert hidden_states.size(0) == gating_output.size(0), ( "Number of tokens mismatch") @@ -996,9 +1011,38 @@ def grouped_topk( if renormalize: topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + topk_weights = topk_weights * routed_scaling_factor return topk_weights.to(torch.float32), topk_ids.to(torch.int32) +def fused_grouped_topk( + hidden_states: torch.Tensor, + gating_output: 
torch.Tensor, + topk: int, + renormalize: bool, + e_score_correction_bias: torch.Tensor, + num_expert_group: int = 0, + topk_group: int = 0, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, +) -> tuple[torch.Tensor, torch.Tensor]: + assert hidden_states.size(0) == gating_output.size(0), ( + "Number of tokens mismatch") + + if scoring_func == "softmax": + scores = torch.softmax(gating_output, dim=-1) + elif scoring_func == "sigmoid": + scores = gating_output.sigmoid() + else: + raise ValueError(f"Unsupported scoring function: {scoring_func}") + + scores_with_bias = scores + e_score_correction_bias.unsqueeze(0) + topk_values, topk_indices = ops.grouped_topk( + scores, scores_with_bias.to(scores.dtype), num_expert_group, + topk_group, topk, renormalize, routed_scaling_factor) + return topk_values.to(torch.float32), topk_indices.to(torch.int32) + + def get_config_dtype_str( dtype: torch.dtype, use_int4_w4a16: Optional[bool] = False, From 9188ae7cb5e78e6ecf95f41b587d3b279c231609 Mon Sep 17 00:00:00 2001 From: Zhonghua Deng Date: Tue, 26 Aug 2025 03:57:08 +0800 Subject: [PATCH 003/112] [Bugfix][V1][P/D]Fix the issue where repeated requests for the same input produce abnormal outputs for P2pNcclConnector (#23403) Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 25 +++++++++++++--- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 30 ++----------------- .../kv_connector/v1/p2p/tensor_memory_pool.py | 5 ++-- 3 files changed, 26 insertions(+), 34 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 32d0e43d71afe..25675d70fe225 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -245,16 +245,33 @@ class P2pNcclConnector(KVConnectorBase_V1): assert self.p2p_nccl_engine is not None + def 
extract_kv_from_layer( + layer: torch.Tensor, + slot_mapping: torch.Tensor, + ) -> torch.Tensor: + """Extract the KV cache from the layer. + + Assume the shape of the layer is (2, num_pages, page_size, xxx) + if MLA is not used, and (num_pages, page_size, xxx) otherwise. + """ + if isinstance(attn_metadata, MLACommonMetadata): + num_pages, page_size = layer.shape[0], layer.shape[1] + return layer.reshape(num_pages * page_size, -1)[slot_mapping, + ...] + num_pages, page_size = layer.shape[1], layer.shape[2] + return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, + ...] + connector_metadata = self._get_connector_metadata() assert isinstance(connector_metadata, P2pNcclConnectorMetadata) for request in connector_metadata.requests: request_id = request.request_id ip, port = self.parse_request_id(request_id, True) remote_address = ip + ":" + str(port + self._rank) - self.p2p_nccl_engine.send_tensor( - request_id + "#" + layer_name, kv_layer, remote_address, - request.slot_mapping, - isinstance(attn_metadata, MLACommonMetadata)) + + kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) + self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name, + kv_cache, remote_address) def wait_for_save(self): if self.is_producer: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index b94f2296dcb36..dfd95548c4632 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -62,8 +62,6 @@ class SendQueueItem: tensor_id: str remote_address: str tensor: torch.Tensor - slot_mapping: torch.Tensor - is_mla: bool class P2pNcclEngine: @@ -202,8 +200,6 @@ class P2pNcclEngine: tensor_id: str, tensor: torch.Tensor, remote_address: typing.Optional[str] = None, - slot_mapping: torch.Tensor = None, - is_mla: bool = False, ) -> bool: if remote_address is None: with 
self.recv_store_cv: @@ -213,9 +209,7 @@ class P2pNcclEngine: item = SendQueueItem(tensor_id=tensor_id, remote_address=remote_address, - tensor=tensor, - slot_mapping=slot_mapping, - is_mla=is_mla) + tensor=tensor) if self.send_type == "PUT": return self.send_sync(item) @@ -433,9 +427,7 @@ class P2pNcclEngine: if item.remote_address not in self.socks: self.create_connect(item.remote_address) - with self.send_stream: - tensor = self.extract_kv_from_layer(item.is_mla, item.tensor, - item.slot_mapping) + tensor = item.tensor sock = self.socks[item.remote_address] comm, rank = self.comms[item.remote_address] @@ -548,21 +540,3 @@ class P2pNcclEngine: self._send_thread.join() if self._ping_thread is not None: self._ping_thread.join() - - @staticmethod - def extract_kv_from_layer( - is_mla: bool, - layer: torch.Tensor, - slot_mapping: torch.Tensor, - ) -> torch.Tensor: - """Extract the KV cache from the layer. - Assume the shape of the layer is (2, num_pages, page_size, xxx) - if MLA is not used, and (num_pages, page_size, xxx) otherwise. - """ - if is_mla: - num_pages, page_size = layer.shape[0], layer.shape[1] - return layer.reshape(num_pages * page_size, -1)[slot_mapping, ...] - - num_pages, page_size = layer.shape[1], layer.shape[2] - return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, - ...] 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py index 02e3bc6274f60..b775276d4a846 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py @@ -99,8 +99,9 @@ class TensorMemoryPool: addr=self.base_address) self.free_lists[self.max_block_size][ initial_block.addr] = initial_block - logger.debug("TensorMemoryPool, base_address:", self.base_address, - self.base_address % self.max_block_size) + + logger.debug("TensorMemoryPool, base_address:%d, max_block_size:%d", + self.base_address, self.max_block_size) def allocate(self, size: int) -> int: """Allocates a memory block of at least the requested size. From 8a044754bd083671e4bb09a68b1edae9610dfccc Mon Sep 17 00:00:00 2001 From: Chaojun Zhang Date: Tue, 26 Aug 2025 04:09:26 +0800 Subject: [PATCH 004/112] [XPU] Delay BF16 check to worker init for spawn compatibility (#22979) Signed-off-by: chzhang --- vllm/platforms/cuda.py | 20 +++++++++++++++++++ vllm/platforms/interface.py | 7 +++++++ vllm/platforms/rocm.py | 20 +++++++++++++++++++ vllm/platforms/xpu.py | 37 +++++++++++------------------------- vllm/v1/worker/gpu_worker.py | 22 +-------------------- vllm/v1/worker/xpu_worker.py | 1 + 6 files changed, 60 insertions(+), 47 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 134ba36e5e735..c0e0fe35e4024 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -518,6 +518,26 @@ class CudaPlatformBase(Platform): supported = True return supported + @classmethod + def check_if_supports_dtype(cls, torch_dtype: torch.dtype): + if torch_dtype == torch.bfloat16: # noqa: SIM102 + if not cls.has_device_capability(80): + capability = cls.get_device_capability() + gpu_name = cls.get_device_name() + + if capability is None: + compute_str = "does not have a compute capability" + else: + 
version_str = capability.as_version_str() + compute_str = f"has compute capability {version_str}" + + raise ValueError( + "Bfloat16 is only supported on GPUs " + "with compute capability of at least 8.0. " + f"Your {gpu_name} GPU {compute_str}. " + "You can use float16 instead by explicitly setting the " + "`dtype` flag in CLI, for example: --dtype=half.") + # NVML utils # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 00bc555288e8e..f6c17de86d05a 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -572,6 +572,13 @@ class Platform: """ return False + @classmethod + def check_if_supports_dtype(cls, torch_dtype: torch.dtype): + """ + Check if the dtype is supported by the current platform. + """ + raise NotImplementedError + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 323ec591c50a3..85b2fe2e480c8 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -462,3 +462,23 @@ class RocmPlatform(Platform): def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str, model_config: "ModelConfig") -> bool: return True + + @classmethod + def check_if_supports_dtype(cls, torch_dtype: torch.dtype): + if torch_dtype == torch.bfloat16: # noqa: SIM102 + if not cls.has_device_capability(80): + capability = cls.get_device_capability() + gpu_name = cls.get_device_name() + + if capability is None: + compute_str = "does not have a compute capability" + else: + version_str = capability.as_version_str() + compute_str = f"has compute capability {version_str}" + + raise ValueError( + "Bfloat16 is only supported on GPUs " + "with compute capability of at least 8.0. " + f"Your {gpu_name} GPU {compute_str}. 
" + "You can use float16 instead by explicitly setting the " + "`dtype` flag in CLI, for example: --dtype=half.") diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index af24437f649f4..235e5d8294e52 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -97,13 +97,6 @@ class XPUPlatform(Platform): from vllm.config import CompilationLevel vllm_config.compilation_config.level = CompilationLevel.NO_COMPILATION # noqa: E501 - # Instances created using VllmConfig() typically have model_config as - # None by default. The modification involves adding a check to prevent - # potential null exceptions check and update model config. - if model_config is not None and model_config.dtype == torch.bfloat16 \ - and not cls.device_support_bf16(): - model_config.dtype = torch.float16 - # lazy import to avoid circular import from vllm.config import CUDAGraphMode compilation_config = vllm_config.compilation_config @@ -162,30 +155,11 @@ class XPUPlatform(Platform): torch.xpu.reset_peak_memory_stats(device) return torch.xpu.max_memory_allocated(device) - @classmethod - def device_support_bf16(cls) -> bool: - device_name = cls.get_device_name().lower() - if cls.is_client_gpu_a770(): - logger.warning("Intel Arc A770 have bfloat16 accuracy known issue," - " fallback to float16") - return False - else: - logger.info( - "Device name %s supports bfloat16. 
Please file an issue " - "if you encounter any accuracy problems with bfloat16.", - device_name) - return True - @classmethod def is_data_center_gpu(cls) -> bool: device_name = cls.get_device_name().lower() return device_name.count("data center gpu") > 0 - @classmethod - def is_client_gpu_a770(cls) -> bool: - device_name = cls.get_device_name().lower() - return device_name.count("a770") > 0 - @classmethod def get_device_communicator_cls(cls) -> str: return "vllm.distributed.device_communicators.xpu_communicator.XpuCommunicator" # noqa @@ -197,3 +171,14 @@ class XPUPlatform(Platform): @classmethod def device_count(cls) -> int: return torch.xpu.device_count() + + @classmethod + def check_if_supports_dtype(cls, torch_dtype: torch.dtype): + if torch_dtype == torch.bfloat16: # noqa: SIM102 + device_name = cls.get_device_name().lower() + # client gpu a770 + if device_name.count("a770") > 0: + raise ValueError( + "Intel Arc A770 have bfloat16 accuracy known issue. " + "You can use float16 instead by explicitly setting the " + "`dtype` flag in CLI, for example: --dtype=half.") diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 1688b8b83e873..0dca45a759216 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -167,7 +167,7 @@ class Worker(WorkerBase): self.device = torch.device(f"cuda:{self.local_rank}") current_platform.set_device(self.device) - _check_if_gpu_supports_dtype(self.model_config.dtype) + current_platform.check_if_supports_dtype(self.model_config.dtype) gc.collect() torch.cuda.empty_cache() @@ -612,23 +612,3 @@ def init_worker_distributed_environment( parallel_config.pipeline_parallel_size) ensure_kv_transfer_initialized(vllm_config) - - -def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): - # Check if the GPU supports the dtype. 
- if torch_dtype == torch.bfloat16: # noqa: SIM102 - if not current_platform.has_device_capability(80): - capability = current_platform.get_device_capability() - gpu_name = current_platform.get_device_name() - - if capability is None: - compute_str = "does not have a compute capability" - else: - version_str = capability.as_version_str() - compute_str = f"has compute capability {version_str}" - - raise ValueError( - "Bfloat16 is only supported on GPUs with compute capability " - f"of at least 8.0. Your {gpu_name} GPU {compute_str}. " - "You can use float16 instead by explicitly setting the " - "`dtype` flag in CLI, for example: --dtype=half.") diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 134d839252653..17288cda8eccf 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -145,6 +145,7 @@ class XPUWorker(Worker): ): self.device = torch.device(f"xpu:{self.local_rank}") current_platform.set_device(self.device) + current_platform.check_if_supports_dtype(self.model_config.dtype) torch.xpu.empty_cache() self.init_gpu_memory = torch.xpu.get_device_properties( self.local_rank).total_memory From c34c82b7fe5f62e771334bdafc0c4559856ce58f Mon Sep 17 00:00:00 2001 From: Pate Motter Date: Mon, 25 Aug 2025 14:29:16 -0700 Subject: [PATCH 005/112] [TPU][Bugfix] Fixes prompt_token_ids error in tpu tests. 
(#23574) Signed-off-by: Pate Motter --- .buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh | 2 +- .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index b571618f48c2b..1073a4ee30afa 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ - && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \ + && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ && python3 -m pip install --progress-bar off hf-transfer echo "--- Python dependencies installed ---" export VLLM_USE_V1=1 diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index d55a786e41e8b..505664f3aecd0 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ - && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \ + && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ && python3 -m pip install --progress-bar off hf-transfer echo "--- Python dependencies installed 
---" export VLLM_USE_V1=1 From 7b6a8372755dfd6b8b2449b24e2d9d8589ff0291 Mon Sep 17 00:00:00 2001 From: Terrence Zhao <32208165+Terrencezzj@users.noreply.github.com> Date: Mon, 25 Aug 2025 17:53:52 -0400 Subject: [PATCH 006/112] [Docs] Update Documentation of Cohere Command-A Models (#23584) Signed-off-by: Terrencezzj Signed-off-by: Abatom Co-authored-by: Zhonghua Deng --- docs/models/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 8fb1019f2bdfb..4763f2281d323 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -332,7 +332,7 @@ th { | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | | | `MBartForConditionalGeneration` | mBART | `facebook/mbart-large-en-ro`, `facebook/mbart-large-50`, etc. | | | | | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | | `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. 
| | ✅︎ | ✅︎ | From efc88cf64a399f5459cd6256223e99672c13614d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 25 Aug 2025 15:42:29 -0700 Subject: [PATCH 007/112] [Misc] Simplify FlashInfer attention metadata (#23585) Signed-off-by: Woosuk Kwon --- vllm/v1/attention/backends/flashinfer.py | 277 ++++++++++------------- 1 file changed, 114 insertions(+), 163 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 50819bb2bb943..941d2a4d7f1ac 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -123,29 +123,9 @@ class FlashInferMetadata: num_actual_tokens: int # Number of tokens excluding padding. - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - qo_indptr_cpu: torch.Tensor - # An example for paged_kv_indices, paged_kv_indptr: - # request 1, page indices [0, 5, 8] - # request 2, page indices [1, 6, 7] - # request 3, page indices [3, 4] - # paged_kv_indices is a concatenation of page indices of all requests: - # [0, 5, 8, 1, 6, 7, 3, 4] - # paged_kv_indptr is used to index into paged_kv_indices: - # [0, 3, 6, 8] - # The indptr of the paged kv cache, shape: [batch_size + 1] (CPU for plan) - paged_kv_indptr_cpu: torch.Tensor - # The page indices of the paged kv cache (on device for plan) - paged_kv_indices: torch.Tensor - # The number of entries in the last page of each request in - # the paged kv cache, shape: [batch_size] (CPU for plan) - paged_kv_last_page_len_cpu: torch.Tensor # The data type of the query q_data_type: torch.dtype - seq_lens_cpu: torch.Tensor slot_mapping: torch.Tensor # For flashinfer trtllm batch decode @@ -164,10 +144,6 @@ class FlashInferMetadata: # For cascade attention (CPU for planning). 
use_cascade: bool - shared_qo_indptr_cpu: Optional[torch.Tensor] = None - shared_kv_page_indptr_cpu: Optional[torch.Tensor] = None - shared_kv_page_indices_cpu: Optional[torch.Tensor] = None - shared_kv_last_page_len_cpu: Optional[torch.Tensor] = None prefill_wrapper: Optional[BatchPrefillWithPagedKVCacheWrapper] = None decode_wrapper: Optional[BatchDecodeWithPagedKVCacheWrapper] = None @@ -327,134 +303,6 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): 2, self._get_workspace_buffer(), get_kv_cache_layout()) return self._cascade_wrapper - def _plan(self, attn_metadata: FlashInferMetadata): - if attn_metadata.use_cascade: - attn_metadata.cascade_wrapper = self._get_cascade_wrapper() - attn_metadata.cascade_wrapper.plan( - [ - attn_metadata.shared_qo_indptr_cpu, - attn_metadata.qo_indptr_cpu - ], - [ - attn_metadata.shared_kv_page_indptr_cpu, - attn_metadata.paged_kv_indptr_cpu - ], - [ - attn_metadata.shared_kv_page_indices_cpu, - attn_metadata.paged_kv_indices - ], - [ - attn_metadata.shared_kv_last_page_len_cpu, - attn_metadata.paged_kv_last_page_len_cpu - ], - self.num_qo_heads, - self.num_kv_heads, - self.head_dim, - self.page_size, - causal=True, - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters.logits_soft_cap, - q_data_type=self.q_data_type, - kv_data_type=self.kv_cache_dtype, - ) - else: - # Regular attention (common case). 
- # Decodes are at the front and prefills are at the back, - # according to reorder_batch() - num_prefills = attn_metadata.num_prefills - num_decodes = attn_metadata.num_decodes - if num_prefills > 0: - # Decodes are first so prefills start after the last decode - prefill_start = num_decodes - attn_metadata.prefill_wrapper = self._get_prefill_wrapper() - assert attn_metadata.qo_indptr_cpu[prefill_start:].shape[ - 0] == num_prefills + 1 - assert attn_metadata.paged_kv_indptr_cpu[prefill_start:].shape[ - 0] == num_prefills + 1 - assert attn_metadata.paged_kv_last_page_len_cpu[ - prefill_start:].shape[0] == num_prefills - # Since prefill_wrapper.run() will be called with - # query[num_decode_tokens:] we need to adjust the qo_indptr - # to be relative to the start of the prefill queries. - qo_indptr_cpu = attn_metadata.qo_indptr_cpu[ - prefill_start:] - attn_metadata.qo_indptr_cpu[prefill_start] - paged_kv_indptr_cpu = attn_metadata.paged_kv_indptr_cpu[ - prefill_start:] - if not attn_metadata.prefill_use_trtllm: - attn_metadata.prefill_wrapper.plan( - qo_indptr_cpu, - paged_kv_indptr_cpu, - attn_metadata.paged_kv_indices, - attn_metadata. - paged_kv_last_page_len_cpu[prefill_start:], - self.num_qo_heads, - self.num_kv_heads, - self.head_dim, - self.page_size, - causal=True, - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters. 
- logits_soft_cap, - q_data_type=self.q_data_type, - kv_data_type=self.kv_cache_dtype, - ) - else: - attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(self.device) - attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to( - self.device) - - if num_decodes > 0: - pure_decode = num_prefills == 0 - # possible required padding for cudagraph replay - use_cudagraph = (self.enable_cuda_graph and pure_decode and - num_decodes <= self._decode_cudagraph_max_bs) - if use_cudagraph: - num_input_tokens = ( - self.vllm_config.pad_for_cudagraph(num_decodes)) - # Carefully fulfill the padding region with reasonable value - # on cpu. - # Make sure paged_kv_indptr_cpu is not decreasing - self.paged_kv_indptr_cpu[1 + num_decodes:1 + - num_input_tokens].fill_( - attn_metadata. - paged_kv_indptr_cpu[-1]) - # Fill the remaining paged_kv_last_page_len_cpu with 1. - # This is because flashinfer treats 0 as a full page - # instead of empty. - self.paged_kv_last_page_len_cpu[ - num_decodes:num_input_tokens].fill_(1) - - else: - num_input_tokens = num_decodes - - attn_metadata.decode_wrapper = self._get_decode_wrapper( - num_input_tokens, use_cudagraph) - if not attn_metadata.decode_use_trtllm: - # Use the persistent buffer with padding length, - # instead of the same address but chunked version - # in atten_metadata when using cudagraph. - fast_plan_decode( - attn_metadata.decode_wrapper, - self.paged_kv_indptr_cpu[:num_input_tokens + 1], - attn_metadata.paged_kv_indices, - self.paged_kv_last_page_len_cpu[:num_input_tokens], - attn_metadata.seq_lens_cpu[:num_input_tokens], - self.num_qo_heads, - self.num_kv_heads, - self.head_dim, - self.page_size, - # Disable flashinfer's pos encoding and use vllm's rope. - pos_encoding_mode="NONE", - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters. 
- logits_soft_cap, - q_data_type=self.q_data_type, - kv_data_type=self.kv_cache_dtype, - ) - def build(self, common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata, @@ -548,13 +396,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, - qo_indptr_cpu=common_attn_metadata.query_start_loc_cpu, - paged_kv_indptr_cpu=self.paged_kv_indptr_cpu[:1 + num_reqs], - paged_kv_indices=paged_kv_indices, - paged_kv_last_page_len_cpu=self. - paged_kv_last_page_len_cpu[:num_reqs], q_data_type=self.q_data_type, - seq_lens_cpu=seq_lens_cpu, slot_mapping=common_attn_metadata.slot_mapping, max_q_len=max_q_len, max_seq_len=max_seq_len, @@ -567,14 +409,123 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): num_prefills=num_prefills, num_prefill_tokens=num_prefill_tokens, use_cascade=use_cascade, - shared_qo_indptr_cpu=shared_qo_indptr_cpu, - shared_kv_page_indptr_cpu=shared_kv_page_indptr_cpu, - shared_kv_page_indices_cpu=shared_kv_page_indices_cpu, - shared_kv_last_page_len_cpu=shared_kv_last_page_len_cpu, ) - self._plan(attn_metadata) + qo_indptr_cpu = common_attn_metadata.query_start_loc_cpu + paged_kv_indptr_cpu = self.paged_kv_indptr_cpu[:1 + num_reqs] + paged_kv_last_page_len_cpu = self.paged_kv_last_page_len_cpu[:num_reqs] + if attn_metadata.use_cascade: + attn_metadata.cascade_wrapper = self._get_cascade_wrapper() + attn_metadata.cascade_wrapper.plan( + [shared_qo_indptr_cpu, qo_indptr_cpu], + [shared_kv_page_indptr_cpu, paged_kv_indptr_cpu], + [shared_kv_page_indices_cpu, paged_kv_indices], + [shared_kv_last_page_len_cpu, paged_kv_last_page_len_cpu], + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + self.page_size, + causal=True, + sm_scale=self.global_hyperparameters.sm_scale, + window_left=self.global_hyperparameters.window_left, + logits_soft_cap=self.global_hyperparameters.logits_soft_cap, + 
q_data_type=self.q_data_type, + kv_data_type=self.kv_cache_dtype, + ) + else: + # Regular attention (common case). + # Decodes are at the front and prefills are at the back, + # according to reorder_batch() + num_prefills = attn_metadata.num_prefills + num_decodes = attn_metadata.num_decodes + if num_prefills > 0: + # Decodes are first so prefills start after the last decode + prefill_start = num_decodes + attn_metadata.prefill_wrapper = self._get_prefill_wrapper() + assert qo_indptr_cpu[prefill_start:].shape[ + 0] == num_prefills + 1 + assert paged_kv_indptr_cpu[prefill_start:].shape[ + 0] == num_prefills + 1 + assert paged_kv_last_page_len_cpu[prefill_start:].shape[ + 0] == num_prefills + # Since prefill_wrapper.run() will be called with + # query[num_decode_tokens:] we need to adjust the qo_indptr + # to be relative to the start of the prefill queries. + qo_indptr_cpu = qo_indptr_cpu[prefill_start:] - qo_indptr_cpu[ + prefill_start] + paged_kv_indptr_cpu = paged_kv_indptr_cpu[prefill_start:] + if not attn_metadata.prefill_use_trtllm: + attn_metadata.prefill_wrapper.plan( + qo_indptr_cpu, + paged_kv_indptr_cpu, + paged_kv_indices, + paged_kv_last_page_len_cpu[prefill_start:], + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + self.page_size, + causal=True, + sm_scale=self.global_hyperparameters.sm_scale, + window_left=self.global_hyperparameters.window_left, + logits_soft_cap=self.global_hyperparameters. 
+ logits_soft_cap, + q_data_type=self.q_data_type, + kv_data_type=self.kv_cache_dtype, + ) + else: + attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(self.device) + attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to( + self.device) + + if num_decodes > 0: + pure_decode = num_prefills == 0 + # possible required padding for cudagraph replay + use_cudagraph = (self.enable_cuda_graph and pure_decode and + num_decodes <= self._decode_cudagraph_max_bs) + if use_cudagraph: + num_input_tokens = ( + self.vllm_config.pad_for_cudagraph(num_decodes)) + # Carefully fulfill the padding region with reasonable value + # on cpu. + # Make sure paged_kv_indptr_cpu is not decreasing + self.paged_kv_indptr_cpu[1 + num_decodes:1 + + num_input_tokens].fill_( + paged_kv_indptr_cpu[-1]) + # Fill the remaining paged_kv_last_page_len_cpu with 1. + # This is because flashinfer treats 0 as a full page + # instead of empty. + self.paged_kv_last_page_len_cpu[ + num_decodes:num_input_tokens].fill_(1) + + else: + num_input_tokens = num_decodes + + attn_metadata.decode_wrapper = self._get_decode_wrapper( + num_input_tokens, use_cudagraph) + if not attn_metadata.decode_use_trtllm: + # Use the persistent buffer with padding length, + # instead of the same address but chunked version + # in atten_metadata when using cudagraph. + fast_plan_decode( + attn_metadata.decode_wrapper, + self.paged_kv_indptr_cpu[:num_input_tokens + 1], + paged_kv_indices, + self.paged_kv_last_page_len_cpu[:num_input_tokens], + seq_lens_cpu[:num_input_tokens], + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + self.page_size, + # Disable flashinfer's pos encoding and use vllm's rope. + pos_encoding_mode="NONE", + sm_scale=self.global_hyperparameters.sm_scale, + window_left=self.global_hyperparameters.window_left, + logits_soft_cap=self.global_hyperparameters. 
+ logits_soft_cap, + q_data_type=self.q_data_type, + kv_data_type=self.kv_cache_dtype, + ) return attn_metadata def build_for_cudagraph_capture( From 2a97ffc33de097f267f217132ced42f4714b7de5 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Mon, 25 Aug 2025 16:44:51 -0700 Subject: [PATCH 008/112] [Misc] Add release note draft to PR template (#23598) Signed-off-by: simon-mo --- .github/PULL_REQUEST_TEMPLATE.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 1b30c1292df85..8043df65d5585 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -7,8 +7,6 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT ## Test Result -## (Optional) Documentation Update - ---
Essential Elements of an Effective PR Description Checklist @@ -17,6 +15,7 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT - [ ] The test plan, such as providing test command. - [ ] The test results, such as pasting the results comparison before and after, or e2e results - [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model. +- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
**BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) From 906e461ed6ddccd3cc7b68fa72048d2d3fcbd72c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 25 Aug 2025 21:29:00 -0400 Subject: [PATCH 009/112] [CI Fix] Pin deepep and pplx tags in tools/ep_kernels/, gate multigpu tests (#23568) Signed-off-by: mgoin --- .buildkite/test-pipeline.yaml | 1 + tests/distributed/test_comm_ops.py | 12 +++++------- tests/kernels/moe/test_deepep_deepgemm_moe.py | 3 +++ tests/kernels/moe/test_deepep_moe.py | 3 +++ .../moe/test_modular_kernel_combinations.py | 2 ++ tests/kernels/moe/test_pplx_cutlass_moe.py | 2 ++ tests/kernels/moe/test_pplx_moe.py | 5 +++++ tests/utils.py | 9 ++++++--- tools/ep_kernels/install_python_libraries.sh | 15 +++++++++++++-- 9 files changed, 40 insertions(+), 12 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 20f3ce1adb46d..1ccfa93c571ce 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -390,6 +390,7 @@ steps: - csrc/moe/ - tests/kernels/moe - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ commands: - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index e2cb579e22dc4..8d84cc2d0ffe6 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -18,7 +18,8 @@ from vllm.distributed import (broadcast_tensor_dict, get_pp_group, tensor_model_parallel_all_reduce, tensor_model_parallel_reduce_scatter) -from ..utils import init_test_distributed_environment, multi_process_parallel +from ..utils import (init_test_distributed_environment, multi_gpu_test, + multi_process_parallel) @ray.remote(num_gpus=1, max_calls=1) @@ -226,8 +227,7 @@ def send_recv_test_worker( torch.testing.assert_close(test_tensor, recv_tensor) 
-@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("test_target", [ all_reduce_test_worker, all_gather_test_worker, @@ -241,8 +241,7 @@ def test_multi_process_tensor_parallel( multi_process_parallel(monkeypatch, tp_size, 1, test_target) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("pp_size", [2]) @pytest.mark.parametrize( "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]) @@ -254,8 +253,7 @@ def test_multi_process_pipeline_parallel( multi_process_parallel(monkeypatch, 1, pp_size, test_target) -@pytest.mark.skipif(torch.cuda.device_count() < 4, - reason="Need at least 4 GPUs to run the test.") +@multi_gpu_test(num_gpus=4) @pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("pp_size", [2]) @pytest.mark.parametrize("test_target", [ diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 6f95581a5e60d..1e922be47f2b4 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -23,6 +23,7 @@ from vllm.utils import has_deep_ep, has_deep_gemm from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, is_deep_gemm_supported) +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch from .utils import make_test_weights @@ -370,6 +371,7 @@ NUM_EXPERTS = [32] @pytest.mark.parametrize("num_experts", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOPKS) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) +@multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm @pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), @@ -427,6 +429,7 @@ USE_FP8_DISPATCH = [False] @pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH) 
@pytest.mark.parametrize("block_size", [[128, 128]]) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) +@multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm @pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index 43804c410b6c2..6a53af68cd53a 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -24,6 +24,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.platforms import current_platform from vllm.utils import has_deep_ep +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch if has_deep_ep(): @@ -411,6 +412,7 @@ DTYPES = [torch.bfloat16, torch.float8_e4m3fn] @pytest.mark.parametrize("topk", [6]) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @pytest.mark.parametrize("per_act_token_quant", [False, True]) +@multi_gpu_test(num_gpus=2) @requires_deep_ep def test_deep_ep_moe( dtype: torch.dtype, @@ -459,6 +461,7 @@ USE_FP8_DISPATCH = [True, False] @pytest.mark.parametrize("topk", [6]) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH) +@multi_gpu_test(num_gpus=2) @requires_deep_ep def test_low_latency_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int], num_experts: int, topk: int, diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index d45982384eb3b..6112183be5475 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +from ...utils import multi_gpu_test from .modular_kernel_tools.common import (Config, 
RankTensors, WeightTensors, reference_moe_impl, run_modular_kernel) @@ -162,6 +163,7 @@ def is_nyi_config(config: Config) -> bool: product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES)) @pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs) @pytest.mark.parametrize("world_size", [2]) +@multi_gpu_test(num_gpus=2) @meets_multi_gpu_requirements def test_modular_kernel_combinations_multigpu( k: int, n: int, e: int, dtype: torch.dtype, diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index 98908f2714707..9e78f4d6e4da0 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -17,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( from vllm.platforms import current_platform from vllm.utils import cdiv +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch try: @@ -247,6 +248,7 @@ def _pplx_moe( @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("world_dp_size", [[2, 1]]) #, [4, 2]]) @pytest.mark.parametrize("use_internode", [False]) +@multi_gpu_test(num_gpus=2) @pytest.mark.skipif( (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( current_platform.get_device_capability()), diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index c2064de97358f..3f36d7ada2e94 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -37,6 +37,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.platforms import current_platform from vllm.utils import round_up +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch requires_pplx = pytest.mark.skipif( @@ -452,6 +453,7 @@ def _pplx_prepare_finalize( @pytest.mark.parametrize("use_internode", [False]) @pytest.mark.optional @requires_pplx 
+@multi_gpu_test(num_gpus=2) def test_pplx_prepare_finalize_slow( mnk: tuple[int, int, int], e: int, @@ -740,6 +742,7 @@ def _pplx_moe( @pytest.mark.parametrize("use_internode", [False]) @pytest.mark.optional @requires_pplx +@multi_gpu_test(num_gpus=2) def test_pplx_moe_slow( mnk: tuple[int, int, int], e: int, @@ -880,6 +883,7 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool, @pytest.mark.parametrize("world_dp_size", [[2, 1]]) @pytest.mark.parametrize("use_internode", [False]) @requires_pplx +@multi_gpu_test(num_gpus=2) def test_pplx_prepare_finalize( world_dp_size: tuple[int, int], use_internode: bool, @@ -893,6 +897,7 @@ def test_pplx_prepare_finalize( @pytest.mark.parametrize("world_dp_size", [[2, 1]]) @pytest.mark.parametrize("use_internode", [False]) @requires_pplx +@multi_gpu_test(num_gpus=2) def test_pplx_moe( world_dp_size: tuple[int, int], use_internode: bool, diff --git a/tests/utils.py b/tests/utils.py index 4dba5494665a3..9d2073f3c1036 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -696,9 +696,12 @@ def multi_process_parallel( os.environ["RAY_RUNTIME_ENV_IGNORE_GITIGNORE"] = "1" ray.init( runtime_env={ - "working_dir": VLLM_PATH, - "excludes": - ["build", ".git", "cmake-build-*", "shellcheck", "dist"] + "working_dir": + VLLM_PATH, + "excludes": [ + "build", ".git", "cmake-build-*", "shellcheck", "dist", + "ep_kernels_workspace" + ] }) distributed_init_port = get_open_port() diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh index e163c83e8b513..59bfe69dc0dd6 100644 --- a/tools/ep_kernels/install_python_libraries.sh +++ b/tools/ep_kernels/install_python_libraries.sh @@ -77,6 +77,7 @@ clone_repo() { local repo_url=$1 local dir_name=$2 local key_file=$3 + local commit_hash=$4 if [ -d "$dir_name" ]; then # Check if directory has uncommitted changes (dirty) @@ -87,17 +88,27 @@ clone_repo() { echo "$dir_name directory exists but clone appears incomplete, cleaning up and 
re-cloning" rm -rf "$dir_name" git clone "$repo_url" + if [ -n "$commit_hash" ]; then + cd "$dir_name" + git checkout "$commit_hash" + cd .. + fi else echo "$dir_name directory exists and appears complete; manually update if needed" fi else git clone "$repo_url" + if [ -n "$commit_hash" ]; then + cd "$dir_name" + git checkout "$commit_hash" + cd .. + fi fi } # build and install pplx, require pytorch installed pushd $WORKSPACE -clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" +clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" "c336faf" cd pplx-kernels # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925 # PIP_NO_BUILD_ISOLATION=0 disables build isolation @@ -106,7 +117,7 @@ popd # build and install deepep, require pytorch installed pushd $WORKSPACE -clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" +clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" "e3908bf" cd DeepEP export NVSHMEM_DIR=$WORKSPACE/nvshmem_install PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e . 
From ae067888d6803b0fe0a2201ae9b464a848a0de01 Mon Sep 17 00:00:00 2001 From: weiliang Date: Tue, 26 Aug 2025 09:30:44 +0800 Subject: [PATCH 010/112] Update Flashinfer to 0.2.14.post1 (#23537) Signed-off-by: Siyuan Fu Signed-off-by: siyuanf Signed-off-by: Weiliang Liu Signed-off-by: Michael Goin Co-authored-by: Siyuan Fu Co-authored-by: Michael Goin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docker/Dockerfile | 2 +- setup.py | 2 +- vllm/compilation/collective_fusion.py | 3 ++- vllm/model_executor/layers/quantization/mxfp4.py | 7 ++++++- vllm/v1/worker/gpu_worker.py | 7 ++++--- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 839ac501dbaf0..2e272cbca8417 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -373,7 +373,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # Install FlashInfer from source ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" # Keep this in sync with "flashinfer" extra in setup.py -ARG FLASHINFER_GIT_REF="v0.2.12" +ARG FLASHINFER_GIT_REF="v0.2.14.post1" # Flag to control whether to compile FlashInfer AOT kernels # Set to "true" to enable AOT compilation: # docker build --build-arg FLASHINFER_AOT_COMPILE=true ... 
diff --git a/setup.py b/setup.py index ca6e0a8592cc2..ffe8ec4e79af7 100644 --- a/setup.py +++ b/setup.py @@ -694,7 +694,7 @@ setup( "mistral_common[audio]"], # Required for audio processing "video": [], # Kept for backwards compatibility # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.2.12"], + "flashinfer": ["flashinfer-python==0.2.14.post1"], # Optional deps for AMD FP4 quantization support "petit-kernel": ["petit-kernel"], }, diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 6ae50245ed3a8..c44ac8e0aa7ea 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -465,7 +465,8 @@ if flashinfer_comm is not None: quant_out=quant_out, scale_out=scale_out, # in vllm we only support swizzled layout - layout_code=flashinfer_comm.FP4QuantizationSFLayout.SWIZZLED, + layout_code=flashinfer_comm.QuantizationSFLayout. + SWIZZLED_128x4, scale_factor=scale_factor, ) else: diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 6a190ebbc063e..df96e5d8c413e 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -6,6 +6,7 @@ import torch from torch.nn.parameter import Parameter from vllm import envs +from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase) @@ -113,6 +114,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): self.topk_indices_dtype = None self.moe = moe self.use_marlin = self._should_use_marlin() + self.max_capture_size = get_current_vllm_config( + ).compilation_config.max_capture_size if current_platform.is_device_capability(100) and not has_flashinfer(): logger.warning_once( @@ -520,7 +523,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): x_scale = None else: x_quant, x_scale = 
mxfp8_quantize(x, False) # to mxfp8 - x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1) + x_scale = x_scale.view(torch.float8_e4m3fn).reshape( + *x.shape[:-1], -1) trtllm_gen_output = trtllm_fp4_block_scale_moe( router_logits.to(torch.bfloat16), None, # routing_bias @@ -549,6 +553,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): self._get_tile_tokens_dim(x, top_k), 1 if renormalize else 0, # routing_method_type, renormalize True, # do finalize + tune_max_num_tokens=self.max_capture_size, )[0] return trtllm_gen_output else: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 0dca45a759216..c252193313344 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -310,6 +310,10 @@ class Worker(WorkerBase): logger.info("Compile and warming up model for size %d", size) self.model_runner._dummy_run(size, skip_eplb=True) + # Warmup and tune the kernels used during model execution before + # cuda graph capture. + kernel_warmup(self) + if not self.model_config.enforce_eager: self.model_runner.capture_model() @@ -334,9 +338,6 @@ class Worker(WorkerBase): self.model_runner._dummy_sampler_run( hidden_states=last_hidden_states) - # Warmup kernels used during model execution - kernel_warmup(self) - # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. 
set_random_seed(self.model_config.seed) From 56dcf4e7e965e34043acf20ca4e4aceda21d41ec Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 25 Aug 2025 21:41:21 -0400 Subject: [PATCH 011/112] [Bug] Fix DeepGEMM Env Control (#23591) Signed-off-by: yewentao256 --- vllm/utils/deep_gemm.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index c0a4ed077e660..b0bc3a79eb0ad 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -27,7 +27,7 @@ def is_deep_gemm_supported() -> bool: is_supported_arch = current_platform.is_cuda() and ( current_platform.is_device_capability(90) or current_platform.is_device_capability(100)) - return has_deep_gemm() and is_supported_arch + return envs.VLLM_USE_DEEP_GEMM and has_deep_gemm() and is_supported_arch @functools.cache @@ -35,12 +35,9 @@ def is_blackwell_deep_gemm_e8m0_used() -> bool: """Return ``True`` if vLLM is configured to use DeepGEMM " "E8M0 scale on a Blackwell-class GPU. 
""" - if not (envs.VLLM_USE_DEEP_GEMM): - logger.debug_once("DeepGEMM E8M0 disabled: VLLM_USE_DEEP_GEMM=0.") - return False - - if not has_deep_gemm(): - logger.debug_once("DeepGEMM E8M0 disabled: DeepGEMM backend missing.") + if not is_deep_gemm_supported(): + logger.debug_once( + "DeepGEMM E8M0 disabled: DeepGEMM not supported on this system.") return False if not envs.VLLM_USE_DEEP_GEMM_E8M0: From 6fd45e7b8a3dc216875428835036a9008cdc0fe3 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 26 Aug 2025 10:34:12 +0800 Subject: [PATCH 012/112] [CI/Build] Use vLLM client's user agent to fetch images (#23561) Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_vision.py | 6 ++---- tests/entrypoints/openai/test_vision_embedding.py | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 8259a81d7b6a1..eaa6c2c163af1 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -6,8 +6,6 @@ import json import openai import pytest import pytest_asyncio -import requests -from PIL import Image from transformers import AutoProcessor from vllm.multimodal.utils import encode_image_base64, fetch_image @@ -36,7 +34,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [ ], [ "The image shows a Venn diagram with three over", - "The image shows a Venn diagram with three intersect", + "This image shows a Venn diagram with three intersect", ], [ "This image displays a gradient of colors ranging from", @@ -88,7 +86,7 @@ def get_hf_prompt_tokens(model_name, content, image_url): "role": "user", "content": f"{placeholder}{content}", }] - images = [Image.open(requests.get(image_url, stream=True).raw)] + images = [fetch_image(image_url)] prompt = processor.tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True) diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py 
index 4e6a21058658b..d3cc2fac6af57 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -5,7 +5,6 @@ import json import pytest import requests -from PIL import Image from transformers import AutoProcessor from vllm.entrypoints.openai.protocol import EmbeddingResponse @@ -64,7 +63,7 @@ def get_hf_prompt_tokens(model_name, content, image_url): placeholder = "<|image_1|> " prompt = f"{placeholder}{content}" - images = [Image.open(requests.get(image_url, stream=True).raw)] + images = [fetch_image(image_url)] inputs = processor(prompt, images, return_tensors="pt") return inputs.input_ids.shape[1] From 6fad29b11b3680c44782cd6e5fe555779d620d6c Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Mon, 25 Aug 2025 19:34:15 -0700 Subject: [PATCH 013/112] Remove graph_pool as member of VllmBackend and argument to CUDAGraphWrapper (#23385) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luka Govedič Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com> Co-authored-by: Luka Govedič --- vllm/compilation/backends.py | 14 ++------------ vllm/compilation/base_static_graph.py | 5 +---- vllm/compilation/cuda_graph.py | 8 ++++---- 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 56494dffc96b3..fa86773d24743 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -294,13 +294,12 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): def __init__(self, module: torch.fx.GraphModule, compile_submod_names: list[str], vllm_config: VllmConfig, - graph_pool, vllm_backend: "VllmBackend"): + vllm_backend: "VllmBackend"): super().__init__(module) from torch._guards import detect_fake_mode self.fake_mode = 
detect_fake_mode() self.compile_submod_names = compile_submod_names self.compilation_config = vllm_config.compilation_config - self.graph_pool = graph_pool self.vllm_config = vllm_config self.vllm_backend = vllm_backend # When True, it annoyingly dumps the torch.fx.Graph on errors. @@ -359,7 +358,6 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): runnable=piecewise_backend, vllm_config=self.vllm_config, runtime_mode=CUDAGraphMode.PIECEWISE, - graph_pool=self.graph_pool, cudagraph_options=CUDAGraphOptions( debug_log_enable=piecewise_backend.is_first_graph, gc_disable=not piecewise_backend.is_first_graph, @@ -405,7 +403,6 @@ class VllmBackend: vllm_config: VllmConfig compilation_config: CompilationConfig - graph_pool: Any _called: bool = False # the graph we compiled graph: fx.GraphModule @@ -433,13 +430,6 @@ class VllmBackend: # them, e.g. backbone (default), eagle_head, etc. self.prefix = prefix or model_tag - global_graph_pool = current_platform.get_global_graph_pool() - - # TODO: in the future, if we want to use multiple - # streams, it might not be safe to share a global pool. - # only investigate this when we use multiple streams - self.graph_pool = global_graph_pool - # Passes to run on the graph post-grad. 
self.post_grad_pass_manager = PostGradPassManager() @@ -586,7 +576,7 @@ class VllmBackend: # propagate the split graph to the piecewise backend, # compile submodules with symbolic shapes PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile, - self.vllm_config, self.graph_pool, + self.vllm_config, self).run(*example_inputs) graph_path = os.path.join(local_cache_dir, "computation_graph.py") diff --git a/vllm/compilation/base_static_graph.py b/vllm/compilation/base_static_graph.py index 1c3f52c533b13..161d066ce9fb8 100644 --- a/vllm/compilation/base_static_graph.py +++ b/vllm/compilation/base_static_graph.py @@ -13,7 +13,7 @@ class AbstractStaticGraphWrapper(Protocol): """ def __init__(self, runnable: Callable, vllm_config: VllmConfig, - runtime_mode: CUDAGraphMode, graph_pool: Any, **kwargs): + runtime_mode: CUDAGraphMode, **kwargs): """ Initializes the StaticGraphWrapper class with graph capturing and execution-related configurations. @@ -25,9 +25,6 @@ class AbstractStaticGraphWrapper(Protocol): graph runtime. See CUDAGraphMode in vllm/config.py. Note that only the subset enum `NONE`, `PIECEWISE` and `FULL` are used as concrete runtime mode for cudagraph dispatching. - graph_pool (Any): - Graph memory pool handle, e.g., - `torch.cuda.graph_pool_handle()`. Keyword Args: kwargs: Additional keyword arguments for platform-specific configurations. 
diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py index 65a38197ad4e2..e233f959c0a4a 100644 --- a/vllm/compilation/cuda_graph.py +++ b/vllm/compilation/cuda_graph.py @@ -67,11 +67,9 @@ class CUDAGraphWrapper: runnable: Callable, vllm_config: VllmConfig, runtime_mode: CUDAGraphMode, - graph_pool: Any = None, cudagraph_options: Optional[CUDAGraphOptions] = None): self.runnable = runnable self.vllm_config = vllm_config - self.graph_pool = graph_pool self.runtime_mode = runtime_mode self.compilation_config = vllm_config.compilation_config @@ -81,8 +79,10 @@ class CUDAGraphWrapper: # assert runtime_mode is not NONE(no cudagraph), otherwise, we don't # need to initialize a CUDAGraphWrapper. assert self.runtime_mode != CUDAGraphMode.NONE - if self.graph_pool is None: - self.graph_pool = current_platform.get_global_graph_pool() + # TODO: in the future, if we want to use multiple + # streams, it might not be safe to share a global pool. + # only investigate this when we use multiple streams + self.graph_pool = current_platform.get_global_graph_pool() if cudagraph_options is None: cudagraph_options = CUDAGraphOptions() From b395b3b0a3166d17c75e74f4eaf0ff4b15f2554f Mon Sep 17 00:00:00 2001 From: Zijing Liu Date: Mon, 25 Aug 2025 21:06:00 -0700 Subject: [PATCH 014/112] [Disagg][Perf] Use CUDA event sync instead of blocking `tolist` to avoid unintentional copy ops blocking across different CUDA streams, improving disagg TTIT/TTFT (#22760) Signed-off-by: Zijing Liu Signed-off-by: Zijing Liu --- vllm/v1/worker/gpu_model_runner.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5d49bbaf270bb..4f6cf9a350706 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -316,6 +316,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Cached outputs. 
self._draft_token_ids: Optional[Union[list[list[int]], torch.Tensor]] = None + self.transfer_event = torch.cuda.Event() + self.sampled_token_ids_pinned_cpu = torch.empty( + (self.max_model_len, 1), + dtype=torch.int64, + device="cpu", + pin_memory=True) def _make_buffer(self, *args, dtype: torch.dtype) -> CpuGpuBuffer: return CpuGpuBuffer(*args, @@ -1691,7 +1697,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): max_gen_len = sampled_token_ids.shape[-1] if max_gen_len == 1: # No spec decode tokens. - valid_sampled_token_ids = sampled_token_ids.tolist() + valid_sampled_token_ids = self._to_list(sampled_token_ids) else: # Includes spec decode tokens. valid_sampled_token_ids = self.rejection_sampler.parse_output( @@ -2219,7 +2225,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): - CUDAGraphMode.PIECEWISE: Piecewise cudagraph. - CUDAGraphMode.FULL: Full cudagraph, attention metadata is needed. - force_attention: If True, always create attention metadata. Used to + force_attention: If True, always create attention metadata. Used to warm up attention backend when mode is NONE. uniform_decode: If True, the batch is a uniform decode batch. skip_eplb: If True, skip EPLB state update. @@ -3233,3 +3239,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): mamba_type=mamba_module.mamba_type) return kv_cache_spec + + def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: + # This is a short term mitigation for issue mentioned in + # https://github.com/vllm-project/vllm/issues/22754. + # `tolist` would trigger a cuda wise stream sync, which + # would block other copy ops from other cuda streams. + # A cuda event sync would avoid such a situation. Since + # this is in the critical path of every single model + # forward loop, this has caused perf issue for a disagg + # setup. 
+ pinned = self.sampled_token_ids_pinned_cpu[:sampled_token_ids.shape[0]] + pinned.copy_(sampled_token_ids, non_blocking=True) + self.transfer_event.record() + self.transfer_event.synchronize() + return pinned.tolist() From ce0e9dbd43e798d5b27a2a379aa4e13d91a279e3 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 26 Aug 2025 14:13:03 +0800 Subject: [PATCH 015/112] [CI/Build] Fix typo in #23561 (#23616) Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index eaa6c2c163af1..106ec121a422e 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -34,7 +34,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [ ], [ "The image shows a Venn diagram with three over", - "This image shows a Venn diagram with three intersect", + "The image shows a Venn diagram with three intersect", ], [ "This image displays a gradient of colors ranging from", From 959783fb996d0d15598f45ca12ffcbee4b681424 Mon Sep 17 00:00:00 2001 From: Bin Jia <45593998+FoolPlayer@users.noreply.github.com> Date: Tue, 26 Aug 2025 14:16:36 +0800 Subject: [PATCH 016/112] [fix] fix seed-oss-parser (#23560) Signed-off-by: jiabin.00 --- tests/tool_use/test_seed_oss_tool_parser.py | 9 ++------- .../openai/tool_parsers/seed_oss_tool_parser.py | 3 +++ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/tool_use/test_seed_oss_tool_parser.py b/tests/tool_use/test_seed_oss_tool_parser.py index d85bc9bbf1b30..c276a598aa68c 100644 --- a/tests/tool_use/test_seed_oss_tool_parser.py +++ b/tests/tool_use/test_seed_oss_tool_parser.py @@ -102,9 +102,7 @@ def test_extract_tool_calls_no_tools(seed_oss_tool_parser): ], argnames=["model_output", "expected_tool_calls", "expected_content"], argvalues=[ - ("""\n\n\n""" - """The current thinking budget is 0, so I will directly start answering the question.\n\n""" - 
"""\n\n""" + ("""\n\n""" """Barcelona, Spain\n\n""", [ ToolCall(function=FunctionCall( @@ -114,10 +112,7 @@ def test_extract_tool_calls_no_tools(seed_oss_tool_parser): }, ), ), type='function') - ], - """\n\n\n""" - """The current thinking budget is 0, so I will directly start answering the question.\n\n""" - ), + ], None), ( """The user\'s current thinking budget is 512.\nLet me analyze the """ """question. The user wants to know the weather in Barcelona, Spain. Looking at the functions available, """ diff --git a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py index 69cf2e68f7c41..95458f07ff2a2 100644 --- a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py @@ -271,6 +271,9 @@ class SeedOssToolParser(ToolParser): # Extract content after think end token result_content = model_output[think_end_index:] thinking_content = model_output[:think_end_index] + else: + thinking_content = "" + result_content = model_output try: function_calls = self._get_function_calls(result_content) From 7d67a9d9f93f86b74066c64c373405aa088e4a16 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 26 Aug 2025 14:50:17 +0800 Subject: [PATCH 017/112] [mypy] Fix incorrect type hint for EAGLE3 support (#23617) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/llama.py | 6 +++--- vllm/model_executor/models/qwen2.py | 6 +++--- vllm/model_executor/models/qwen3.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f99f1c3643fd4..e39a6df843cd4 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -353,7 +353,7 @@ class LlamaModel(nn.Module): else: self.norm = PPMissingLayer() - self.aux_hidden_state_layers: tuple[int] = tuple() + self.aux_hidden_state_layers = tuple[int, ...]() 
self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( @@ -553,10 +553,10 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) - def set_aux_hidden_state_layers(self, layers: tuple[int]) -> None: + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: self.model.aux_hidden_state_layers = layers - def get_eagle3_aux_hidden_state_layers(self) -> tuple[int]: + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: num_layers = len(self.model.layers) return (2, num_layers // 2, num_layers - 3) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 801741ecaf3b8..27c1e68c6704b 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -333,7 +333,7 @@ class Qwen2Model(nn.Module): else: self.norm = PPMissingLayer() - self.aux_hidden_state_layers: tuple[int] = tuple() + self.aux_hidden_state_layers = tuple[int, ...]() def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) @@ -488,10 +488,10 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) - def set_aux_hidden_state_layers(self, layers: tuple[int]) -> None: + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: self.model.aux_hidden_state_layers = layers - def get_eagle3_aux_hidden_state_layers(self) -> tuple[int]: + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: num_layers = len(self.model.layers) return (2, num_layers // 2, num_layers - 3) diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 2060206633702..dddb47048a1fc 100644 --- a/vllm/model_executor/models/qwen3.py +++ 
b/vllm/model_executor/models/qwen3.py @@ -304,10 +304,10 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) - def set_aux_hidden_state_layers(self, layers: tuple[int]) -> None: + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: self.model.aux_hidden_state_layers = layers - def get_eagle3_aux_hidden_state_layers(self) -> tuple[int]: + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: num_layers = len(self.model.layers) return (2, num_layers // 2, num_layers - 3) From 3ecbb14b814f9559bce88fa62ea8b5deedbc6076 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Tue, 26 Aug 2025 14:57:08 +0800 Subject: [PATCH 018/112] [Benchmarks] add benchmark for embedding models (#23000) Signed-off-by: zjy0516 --- vllm/benchmarks/datasets.py | 67 +++-- vllm/benchmarks/lib/endpoint_request_func.py | 57 +++- vllm/benchmarks/serve.py | 257 +++++++++++++------ 3 files changed, 274 insertions(+), 107 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index e586337367b1c..93519b5ba1523 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -73,7 +73,7 @@ class SampleRequest: Represents a single inference request for benchmarking. """ - prompt: Union[str, Any] + prompt: Union[str, list[str]] prompt_len: int expected_output_len: int multi_modal_data: Optional[ @@ -409,6 +409,7 @@ class RandomDataset(BenchmarkDataset): range_ratio: float = DEFAULT_RANGE_RATIO, input_len: int = DEFAULT_INPUT_LEN, output_len: int = DEFAULT_OUTPUT_LEN, + batchsize: int = 1, **kwargs, ) -> list[SampleRequest]: @@ -439,6 +440,21 @@ class RandomDataset(BenchmarkDataset): request_id=request_id_prefix + str(i), ) ) + # only used for embeddings benchmark. 
+ if batchsize > 1: + batch_requests = [] + # Create batched requests + for i in range(0, num_requests, batchsize): + batch = requests[i : i + batchsize] + batch_requests.append( + SampleRequest( + prompt=[req.prompt for req in batch], + prompt_len=sum(req.prompt_len for req in batch), + expected_output_len=0, + request_id=request_id_prefix + str(i // batchsize), + ) + ) + requests = batch_requests return requests def get_prefix( @@ -475,8 +491,8 @@ class RandomDataset(BenchmarkDataset): input_high = math.ceil(real_input_len * (1 + range_ratio)) output_low = math.floor(output_len * (1 - range_ratio)) output_high = math.ceil(output_len * (1 + range_ratio)) - # Ensure the lower bound for output length is at least 1 to - # prevent sampling 0 tokens. + # Ensure the lower bound for output length is at least 1 to + # prevent sampling 0 tokens. output_low = max(output_low, 1) if input_low > input_high: @@ -506,7 +522,6 @@ class RandomDataset(BenchmarkDataset): size=num_requests) return input_lens, output_lens, offsets - def generate_token_sequence( self, *, @@ -1105,6 +1120,13 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "context length sampled from [input_len * (1 - range_ratio), " "input_len * (1 + range_ratio)]."), ) + random_group.add_argument( + "--random-batch-size", + type=int, + default=1, + help=("Batch size for random sampling. " + "Only used for embeddings benchmark."), + ) # random multimodal dataset options random_mm_group = parser.add_argument_group( @@ -1196,8 +1218,6 @@ def add_dataset_parser(parser: FlexibleArgumentParser): ), ) - - hf_group = parser.add_argument_group("hf dataset options") hf_group.add_argument("--hf-subset", type=str, @@ -1348,22 +1368,24 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: else: # For datasets that follow a similar structure, use a mapping. 
dataset_mapping = { - "sharegpt": - lambda: ShareGPTDataset(random_seed=args.seed, - dataset_path=args.dataset_path).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - output_len=args.sharegpt_output_len, - request_id_prefix=args.request_id_prefix, - ), - "burstgpt": - lambda: BurstGPTDataset(random_seed=args.seed, - dataset_path=args.dataset_path). - sample(tokenizer=tokenizer, num_requests=args.num_prompts, - request_id_prefix=args.request_id_prefix,), - "random": - lambda: RandomDataset(random_seed=args.seed, - dataset_path=args.dataset_path).sample( + "sharegpt": lambda: ShareGPTDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, + request_id_prefix=args.request_id_prefix, + ), + "burstgpt": lambda: BurstGPTDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + request_id_prefix=args.request_id_prefix, + ), + "random": lambda: RandomDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( tokenizer=tokenizer, num_requests=args.num_prompts, prefix_len=args.random_prefix_len, @@ -1371,6 +1393,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: output_len=args.random_output_len, range_ratio=args.random_range_ratio, request_id_prefix=args.request_id_prefix, + batchsize=args.random_batch_size, ), "random-mm": lambda: RandomMultiModalDataset( diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index 76beded4d5189..6bb2a497119e9 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -69,8 +69,8 @@ async def async_request_openai_completions( ), "OpenAI Completions API URL must end with 'completions' or 'profile'." 
payload = { - "model": request_func_input.model_name \ - if request_func_input.model_name else request_func_input.model, + "model": request_func_input.model_name + if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, "repetition_penalty": 1.0, @@ -135,7 +135,7 @@ async def async_request_openai_completions( # Decoding phase else: output.itl.append(timestamp - - most_recent_timestamp) + most_recent_timestamp) most_recent_timestamp = timestamp generated_text += text or "" @@ -254,7 +254,7 @@ async def async_request_openai_chat_completions( # Decoding phase else: output.itl.append(timestamp - - most_recent_timestamp) + most_recent_timestamp) generated_text += content or "" elif usage := data.get("usage"): @@ -394,12 +394,61 @@ async def async_request_openai_audio( return output +async def async_request_openai_embeddings( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: Optional[tqdm] = None, +): + api_url = request_func_input.api_url + assert api_url.endswith( + "embeddings" + ), "OpenAI Embeddings API URL must end with 'embeddings'." 
+ + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + payload = { + "model": request_func_input.model, + "input": request_func_input.prompt, + } + + output = RequestFuncOutput() + st = time.perf_counter() + try: + async with session.post( + url=api_url, + headers=headers, + json=payload + ) as response: + if response.status == 200: + output.latency = time.perf_counter() - st + data = await response.json() + output.success = True + output.generated_text = "" + output.prompt_len = data.get( + "usage", {}).get( + "prompt_tokens", 0) + else: + output.success = False + output.error = response.reason or "" + except Exception as e: + output.success = False + output.error = str(e) + + if pbar: + pbar.update(1) + return output + + # TODO: Add more request functions for different API protocols. ASYNC_REQUEST_FUNCS = { "vllm": async_request_openai_completions, "openai": async_request_openai_completions, "openai-chat": async_request_openai_chat_completions, "openai-audio": async_request_openai_audio, + "openai-embeddings": async_request_openai_embeddings, } OPENAI_COMPATIBLE_BACKENDS = [ diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 79f2c475cbe5d..abb838316cd31 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -4,7 +4,7 @@ r"""Benchmark online serving throughput. 
On the server side, run one of the following commands to launch the vLLM OpenAI API server: - vllm serve + vllm serve On the client side, run: vllm bench serve \ @@ -26,6 +26,7 @@ import warnings from collections.abc import AsyncGenerator, Iterable from dataclasses import dataclass from datetime import datetime +from enum import Enum from typing import Any, Literal, Optional import aiohttp @@ -46,6 +47,11 @@ from vllm.transformers_utils.tokenizer import get_tokenizer MILLISECONDS_TO_SECONDS_CONVERSION = 1000 +class TaskType(Enum): + GENERATION = "generation" + EMBEDDING = "embedding" + + @dataclass class BenchmarkMetrics: completed: int @@ -75,6 +81,16 @@ class BenchmarkMetrics: std_e2el_ms: float percentiles_e2el_ms: list[tuple[float, float]] +@dataclass +class EmbedBenchmarkMetrics: + completed: int + total_input: int + request_throughput: float + total_token_throughput :float + mean_e2el_ms: float + std_e2el_ms: float + median_e2el_ms: float + percentiles_e2el_ms: float def _get_current_request_rate( ramp_up_strategy: Optional[Literal["linear", "exponential"]], @@ -146,11 +162,11 @@ async def get_request( delay_ts = [] for request_index, request in enumerate(input_requests): current_request_rate = _get_current_request_rate(ramp_up_strategy, - ramp_up_start_rps, - ramp_up_end_rps, - request_index, - total_requests, - request_rate) + ramp_up_start_rps, + ramp_up_end_rps, + request_index, + total_requests, + request_rate) request_rates.append(current_request_rate) if current_request_rate == float("inf"): delay_ts.append(0) @@ -160,7 +176,7 @@ async def get_request( # Sample the request interval from the gamma distribution. # If burstiness is 1, it follows exponential distribution. delay_ts.append(np.random.gamma(shape=burstiness, scale=theta)) - + # Calculate the cumulative delay time from the first sent out requests. 
for i in range(1, len(delay_ts)): delay_ts[i] += delay_ts[i - 1] @@ -170,11 +186,11 @@ async def get_request( # logic would re-scale delay time to ensure the final delay_ts # align with target_total_delay_s. # - # NOTE: If we simply accumulate the random delta values - # from the gamma distribution, their sum would have 1-2% gap + # NOTE: If we simply accumulate the random delta values + # from the gamma distribution, their sum would have 1-2% gap # from target_total_delay_s. The purpose of the following logic is to - # close the gap for stablizing the throughput data - # from different random seeds. + # close the gap for stablizing the throughput data + # from different random seeds. target_total_delay_s = total_requests / request_rate normalize_factor = target_total_delay_s / delay_ts[-1] delay_ts = [delay * normalize_factor for delay in delay_ts] @@ -189,6 +205,51 @@ async def get_request( yield request, request_rates[request_index] +def calculate_metrics_for_embeddings( + outputs: list[RequestFuncOutput], + dur_s: float, + selected_percentiles: list[float] +) -> EmbedBenchmarkMetrics: + """Calculate the metrics for the embedding requests. + + Args: + outputs: The outputs of the requests. + dur_s: The duration of the benchmark. + selected_percentiles: The percentiles to select. + + Returns: + The calculated benchmark metrics. + """ + total_input = 0 + completed = 0 + e2els: list[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + e2els.append(outputs[i].latency) + completed += 1 + total_input += outputs[i].prompt_len + + if completed == 0: + warnings.warn( + "All requests failed. 
This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2) + metrics = EmbedBenchmarkMetrics( + completed=completed, + total_input=total_input, + request_throughput=completed / dur_s, + total_token_throughput=total_input / dur_s, + mean_e2el_ms=np.mean(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, + percentiles_e2el_ms=[ + (p, np.percentile(e2els or 0, p) * 1000) + for p in selected_percentiles + ], + ) + return metrics + + def calculate_metrics( input_requests: list[SampleRequest], outputs: list[RequestFuncOutput], @@ -334,8 +395,16 @@ async def benchmark( ramp_up_end_rps: Optional[int] = None, ready_check_timeout_sec: int = 600, ): + task_type = ( + TaskType.EMBEDDING + if api_url.endswith("/v1/embeddings") + else TaskType.GENERATION + ) if endpoint_type in ASYNC_REQUEST_FUNCS: - request_func = ASYNC_REQUEST_FUNCS[endpoint_type] + if task_type == TaskType.EMBEDDING: + request_func = ASYNC_REQUEST_FUNCS["openai-embeddings"] + else: + request_func = ASYNC_REQUEST_FUNCS[endpoint_type] else: raise ValueError(f"Unknown endpoint_type: {endpoint_type}") @@ -421,8 +490,8 @@ async def benchmark( if profile_output.success: print("Profiler started") - distribution = ("Poisson process" if burstiness == 1.0 - else "Gamma distribution") + distribution = ("Poisson process" if burstiness == 1.0 + else "Gamma distribution") if ramp_up_strategy is not None: print(f"Traffic ramp-up strategy: {ramp_up_strategy}.") @@ -449,7 +518,7 @@ async def benchmark( session=session, pbar=pbar) async with semaphore: - return await request_func(request_func_input=request_func_input, + return await request_func(request_func_input=request_func_input, session=session, pbar=pbar) @@ -513,14 +582,22 @@ async def benchmark( benchmark_duration = time.perf_counter() - benchmark_start_time - metrics, actual_output_lens = calculate_metrics( - input_requests=input_requests, - outputs=outputs, - 
dur_s=benchmark_duration, - tokenizer=tokenizer, - selected_percentiles=selected_percentiles, - goodput_config_dict=goodput_config_dict, - ) + if task_type == TaskType.GENERATION: + metrics, actual_output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentiles=selected_percentiles, + goodput_config_dict=goodput_config_dict, + ) + else: + metrics = calculate_metrics_for_embeddings( + outputs=outputs, + dur_s=benchmark_duration, + selected_percentiles=selected_percentiles, + ) + actual_output_lens = 0 print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) @@ -529,39 +606,55 @@ async def benchmark( max_concurrency)) if request_rate != float('inf'): print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", - request_rate )) + request_rate)) print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", - metrics.total_output)) + if isinstance(metrics, BenchmarkMetrics): + print("{:<40} {:<10}".format( + "Total generated tokens:", metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) if goodput_config_dict: print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput)) - print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", - metrics.output_throughput)) + if isinstance(metrics, BenchmarkMetrics): + print( + "{:<40} {:<10.2f}".format( + "Output token throughput (tok/s):", metrics.output_throughput + ) + ) print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", metrics.total_token_throughput)) - result = { - "duration": benchmark_duration, - "completed": metrics.completed, - "total_input_tokens": metrics.total_input, - 
"total_output_tokens": metrics.total_output, - "request_throughput": metrics.request_throughput, - "request_goodput": - metrics.request_goodput if goodput_config_dict else None, - "output_throughput": metrics.output_throughput, - "total_token_throughput": metrics.total_token_throughput, - "input_lens": [output.prompt_len for output in outputs], - "output_lens": actual_output_lens, - "ttfts": [output.ttft for output in outputs], - "itls": [output.itl for output in outputs], - "generated_texts": [output.generated_text for output in outputs], - "errors": [output.error for output in outputs], - } + if isinstance(metrics, BenchmarkMetrics): + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, + "request_goodput": + metrics.request_goodput if goodput_config_dict else None, + "output_throughput": metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "output_lens": actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "generated_texts": [output.generated_text for output in outputs], + "errors": [output.error for output in outputs], + } + else: + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "request_throughput": metrics.request_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "errors": [output.error for output in outputs], + } if rps_change_events: result["rps_change_events"] = rps_change_events @@ -598,10 +691,11 @@ async def benchmark( value)) result[f"p{p_word}_{metric_attribute_name}_ms"] = value - process_one_metric("ttft", "TTFT", "Time to First Token") - process_one_metric("tpot", 
"TPOT", - "Time per Output Token (excl. 1st token)") - process_one_metric("itl", "ITL", "Inter-token Latency") + if task_type == TaskType.GENERATION: + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric( + "tpot", "TPOT", "Time per Output Token (excl. 1st token)") + process_one_metric("itl", "ITL", "Inter-token Latency") process_one_metric("e2el", "E2EL", "End-to-end Latency") print("=" * 50) @@ -732,7 +826,8 @@ def add_cli_args(parser: argparse.ArgumentParser): "initiated, this argument will control how many are actually allowed " "to execute at a time. This means that when used in combination, the " "actual request rate may be lower than specified with --request-rate, " - "if the server is not processing requests fast enough to keep up.") + "if the server is not processing requests fast enough to keep up.", + ) parser.add_argument( "--model", @@ -743,8 +838,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--tokenizer", type=str, - help= - "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) parser.add_argument("--use-beam-search", action="store_true") parser.add_argument( @@ -968,6 +1062,7 @@ def add_cli_args(parser: argparse.ArgumentParser): def main(args: argparse.Namespace) -> dict[str, Any]: return asyncio.run(main_async(args)) + async def main_async(args: argparse.Namespace) -> dict[str, Any]: print(args) random.seed(args.seed) @@ -1046,32 +1141,32 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: gc.freeze() benchmark_result = await benchmark( - endpoint_type=args.endpoint_type, - api_url=api_url, - base_url=base_url, - model_id=model_id, - model_name=model_name, - tokenizer=tokenizer, - input_requests=input_requests, - logprobs=args.logprobs, - request_rate=args.request_rate, - burstiness=args.burstiness, - disable_tqdm=args.disable_tqdm, - profile=args.profile, 
- selected_percentile_metrics=args.percentile_metrics.split(","), - selected_percentiles=[ - float(p) for p in args.metric_percentiles.split(",") - ], - ignore_eos=args.ignore_eos, - goodput_config_dict=goodput_config_dict, - max_concurrency=args.max_concurrency, - lora_modules=args.lora_modules, - extra_body=sampling_params, - ramp_up_strategy=args.ramp_up_strategy, - ramp_up_start_rps=args.ramp_up_start_rps, - ramp_up_end_rps=args.ramp_up_end_rps, - ready_check_timeout_sec=args.ready_check_timeout_sec, - ) + endpoint_type=args.endpoint_type, + api_url=api_url, + base_url=base_url, + model_id=model_id, + model_name=model_name, + tokenizer=tokenizer, + input_requests=input_requests, + logprobs=args.logprobs, + request_rate=args.request_rate, + burstiness=args.burstiness, + disable_tqdm=args.disable_tqdm, + profile=args.profile, + selected_percentile_metrics=args.percentile_metrics.split(","), + selected_percentiles=[ + float(p) for p in args.metric_percentiles.split(",") + ], + ignore_eos=args.ignore_eos, + goodput_config_dict=goodput_config_dict, + max_concurrency=args.max_concurrency, + lora_modules=args.lora_modules, + extra_body=sampling_params, + ramp_up_strategy=args.ramp_up_strategy, + ramp_up_start_rps=args.ramp_up_start_rps, + ramp_up_end_rps=args.ramp_up_end_rps, + ready_check_timeout_sec=args.ready_check_timeout_sec, + ) # Save config and results to json result_json: dict[str, Any] = {} @@ -1098,7 +1193,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: # Traffic result_json["request_rate"] = (args.request_rate if args.request_rate - < float("inf") else "inf") + < float("inf") else "inf") result_json["burstiness"] = args.burstiness result_json["max_concurrency"] = args.max_concurrency @@ -1132,7 +1227,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: if args.max_concurrency is not None else "") label = label or endpoint_type if args.ramp_up_strategy is not None: - file_name = 
f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa + file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa else: file_name = f"{label}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa if args.result_filename: @@ -1149,4 +1244,4 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: json.dump(result_json, outfile) save_to_pytorch_benchmark_format(args, result_json, file_name) - return result_json \ No newline at end of file + return result_json From bfc1edc9f5bde581e0eec5c830a5a4a7b710fe6a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 26 Aug 2025 08:16:44 +0100 Subject: [PATCH 019/112] [Docs] Fix titles for multi-file examples that are rendered in the docs (#23573) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/mkdocs/hooks/generate_examples.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 1e8b848db46d8..881df791698e2 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -70,6 +70,10 @@ class Example: self.other_files = self.determine_other_files() self.title = self.determine_title() + @property + def is_code(self) -> bool: + return self.main_file.suffix != ".md" + def determine_main_file(self) -> Path: """ Determines the main file in the given path. 
@@ -101,6 +105,12 @@ class Example: return [file for file in self.path.rglob("*") if is_other_file(file)] def determine_title(self) -> str: + if not self.is_code: + with open(self.main_file) as f: + first_line = f.readline().strip() + match = re.match(r'^#\s+(?P.+)$', first_line) + if match: + return match.group('title') return fix_case(self.path.stem.replace("_", " ").title()) def generate(self) -> str: @@ -110,11 +120,13 @@ class Example: # Use long code fence to avoid issues with # included files containing code fences too code_fence = "``````" - is_code = self.main_file.suffix != ".md" - if is_code: + # Skip the title from md snippets as it's been included above + start_line = 2 + if self.is_code: content += f"{code_fence}{self.main_file.suffix[1:]}\n" - content += f'--8<-- "{self.main_file}"\n' - if is_code: + start_line = 1 + content += f'--8<-- "{self.main_file}:{start_line}"\n' + if self.is_code: content += f"{code_fence}\n" content += "\n" From ff77764f868290bf746d101d3998095b73e7811d Mon Sep 17 00:00:00 2001 From: Raghavan <oneraghavan@gmail.com> Date: Tue, 26 Aug 2025 13:35:37 +0530 Subject: [PATCH 020/112] Fix CLI parameter documentation inconsistency in pooling_models.md (#23630) --- docs/models/pooling_models.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 39f209d0eb7ed..753d8bd0b8339 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -205,12 +205,12 @@ Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json,` it is allowed to change the output to arbitrary dimensions. Using `matryoshka_dimensions` can control the allowed output dimensions. 
-For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline) or `--hf_overrides '{"is_matryoshka": true}'`, `--hf_overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'`(online). +For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline) or `--hf-overrides '{"is_matryoshka": true}'`, `--hf-overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'`(online). Here is an example to serve a model with Matryoshka Embeddings enabled. ```text -vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"matryoshka_dimensions":[256]}' +vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}' ``` ### Offline Inference From 9b5f64238fbd0f98928587b3426cbf69eea96ae7 Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Tue, 26 Aug 2025 16:09:14 +0800 Subject: [PATCH 021/112] [Bugfix] Fix Qwen25VL packed_modules_mapping (#23604) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> --- vllm/model_executor/models/qwen2_5_vl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 0f11636ce3bd3..648ba81eb3877 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -853,6 +853,7 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsQuant): packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], } From b5d34af3286ee0334d9f7bd729774ac55c5805e9 Mon Sep 17 00:00:00 2001 From: 
Roger Wang <hey@rogerw.io> Date: Tue, 26 Aug 2025 02:46:28 -0700 Subject: [PATCH 022/112] [Bugfix] Fix scheduling when repeated images in one request (#23544) Signed-off-by: Roger Wang <hey@rogerw.me> Signed-off-by: Roger Wang <hey@rogerw.io> Co-authored-by: Roger Wang <hey@rogerw.me> Co-authored-by: knlnguyen1802 <knlnguyen1802@gmail.com> --- tests/v1/core/test_encoder_cache_manager.py | 49 ++++++++++++++----- vllm/v1/core/encoder_cache_manager.py | 32 +++++++----- vllm/v1/core/sched/scheduler.py | 54 +++++++++++++++------ 3 files changed, 96 insertions(+), 39 deletions(-) diff --git a/tests/v1/core/test_encoder_cache_manager.py b/tests/v1/core/test_encoder_cache_manager.py index 60d932a878abb..ae5b751f45a4b 100644 --- a/tests/v1/core/test_encoder_cache_manager.py +++ b/tests/v1/core/test_encoder_cache_manager.py @@ -22,7 +22,7 @@ def test_basic_allocate_and_reuse(): req = MockRequest("r1", ["imgA"], [4]) assert not cache.check_and_update_cache(req, 0) - assert cache.try_allocate(req, 0, int(1e9)) + assert cache.can_allocate(req, 0, int(1e9), 0) cache.allocate(req, 0) @@ -44,7 +44,7 @@ def test_freeing_decreases_refcount_and_moves_to_freeable(): manager = EncoderCacheManager(cache_size=10) req = MockRequest("req2", ["img3"], [5]) - assert manager.try_allocate(req, 0, int(1e9)) + assert manager.can_allocate(req, 0, int(1e9), 0) manager.allocate(req, 0) assert len(manager.cached["img3"]) == 1 @@ -60,10 +60,10 @@ def test_free_request_frees_all_inputs(): manager = EncoderCacheManager(cache_size=10) req = MockRequest("req3", ["a", "b"], [2, 3]) - assert manager.try_allocate(req, 0, int(1e9)) + assert manager.can_allocate(req, 0, int(1e9), 0) manager.allocate(req, 0) - assert manager.try_allocate(req, 1, int(1e9)) + assert manager.can_allocate(req, 1, int(1e9), 0) manager.allocate(req, 1) assert len(manager.cached["a"]) == 1 @@ -84,11 +84,11 @@ def test_eviction_when_cache_is_full(): req1 = MockRequest("req1", ["x"], [6]) req2 = MockRequest("req2", ["y"], [5]) - assert 
manager.try_allocate(req1, 0, int(1e9)) + assert manager.can_allocate(req1, 0, int(1e9), 0) manager.allocate(req1, 0) manager.free_encoder_input(req1, 0) - assert manager.try_allocate(req2, 0, int(1e9)) + assert manager.can_allocate(req2, 0, int(1e9), 0) manager.allocate(req2, 0) # 'x' should have been evicted. @@ -100,10 +100,10 @@ def test_get_cached_input_ids(): manager = EncoderCacheManager(cache_size=10) req = MockRequest("reqX", ["m", "n", "o"], [2, 4, 3]) - assert manager.try_allocate(req, 0, int(1e9)) + assert manager.can_allocate(req, 0, int(1e9), 0) manager.allocate(req, 0) - assert manager.try_allocate(req, 2, int(1e9)) + assert manager.can_allocate(req, 2, int(1e9), 0) manager.allocate(req, 2) cached_ids = manager.get_cached_input_ids(req) @@ -114,7 +114,7 @@ def test_has_cache_restores_from_freeable(): manager = EncoderCacheManager(cache_size=10) req = MockRequest("reqY", ["imgZ"], [4]) - assert manager.try_allocate(req, 0, int(1e9)) + assert manager.can_allocate(req, 0, int(1e9), 0) manager.allocate(req, 0) manager.free_encoder_input(req, 0) @@ -131,14 +131,41 @@ def test_get_freed_mm_hashes_clears_freed_list(): req1 = MockRequest("reqA", ["a"], [5]) req2 = MockRequest("reqB", ["b"], [6]) - assert manager.try_allocate(req1, 0, int(1e9)) + assert manager.can_allocate(req1, 0, int(1e9), 0) manager.allocate(req1, 0) manager.free_encoder_input(req1, 0) # Should trigger eviction of 'a'. 
- assert manager.try_allocate(req2, 0, int(1e9)) + assert manager.can_allocate(req2, 0, int(1e9), 0) manager.allocate(req2, 0) freed = manager.get_freed_mm_hashes() assert "a" in freed assert manager.get_freed_mm_hashes() == [] + + +def test_schedule_request_multi_images_respect_space_limit(): + manager = EncoderCacheManager(cache_size=10) + req = MockRequest("reqA", ["a", "b"], [5, 6]) + compute_budget = 100 + + num_tokens_to_schedule = 0 + assert manager.can_allocate(req, 0, compute_budget, num_tokens_to_schedule) + num_tokens_to_schedule += req.get_num_encoder_tokens(0) + compute_budget -= req.get_num_encoder_tokens(0) + + assert not manager.can_allocate(req, 1, compute_budget, + num_tokens_to_schedule) + + +def test_schedule_request_multi_images_respect_compute_limit(): + manager = EncoderCacheManager(cache_size=100) + req = MockRequest("reqA", ["a", "b"], [5, 6]) + compute_budget = 10 + num_tokens_to_schedule = 0 + assert manager.can_allocate(req, 0, compute_budget, num_tokens_to_schedule) + num_tokens_to_schedule += req.get_num_encoder_tokens(0) + compute_budget -= req.get_num_encoder_tokens(0) + + assert not manager.can_allocate(req, 1, compute_budget, + num_tokens_to_schedule) diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 70af419fcb955..c9d18033a1988 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -99,8 +99,9 @@ class EncoderCacheManager: self.cached[mm_hash].add(request.request_id) return True - def try_allocate(self, request: Request, input_id: int, - encoder_budget: int) -> bool: + def can_allocate(self, request: Request, input_id: int, + encoder_compute_budget: int, + num_tokens_to_schedule: int) -> bool: """Check if there's sufficient cache space for a multimodal input. If there is, return True and update EncoderCacheManager state. @@ -116,6 +117,10 @@ class EncoderCacheManager: Args: request: The request containing the multimodal input. 
input_id: Index of the multimodal input within the request. + encoder_compute_budget: Number of encoder tokens allowed to be + computed when this method is invoked. + num_tokens_to_schedule: Number of tokens already scheduled to be + allocated with cache space when this method is invoked. Returns: True if there's enough capacity to hold the encoder output for this @@ -128,13 +133,13 @@ class EncoderCacheManager: num_tokens = request.get_num_encoder_tokens(input_id) # Not enough compute budget - if num_tokens > encoder_budget: + if num_tokens > encoder_compute_budget: return False + num_tokens += num_tokens_to_schedule + # Enough free slots if num_tokens <= self.num_free_slots: - self.num_free_slots -= num_tokens - self.num_freeable_slots -= num_tokens return True # Not enough reclaimable slots @@ -149,8 +154,6 @@ class EncoderCacheManager: del self.cached[mm_hash] self.freed.append(mm_hash) self.num_free_slots += num_free_token - self.num_free_slots -= num_tokens - self.num_freeable_slots -= num_tokens return True def allocate(self, request: Request, input_id: int) -> None: @@ -161,19 +164,24 @@ class EncoderCacheManager: the model runner; this method updates the manager's bookkeeping. Note: - This method assumes try_allocate() returned True for the same input. + This method assumes can_allocate() returned True for the same input. """ - # Encoder cache space budget should be already updated for the - # multimodal input and non-negative after try_allocate() is called. - assert self.num_free_slots >= 0 - assert self.num_freeable_slots >= 0 mm_hash = request.mm_hashes[input_id] request_id = request.request_id if mm_hash not in self.cached: self.cached[mm_hash] = set() + num_encoder_tokens = request.get_num_encoder_tokens(input_id) + + # NOTE: Encoder cache should always have enough space for encoder inputs + # that are scheduled since eviction takes place at can_allocate(). 
+ assert self.num_free_slots >= num_encoder_tokens + assert self.num_freeable_slots >= num_encoder_tokens + self.cached[mm_hash].add(request_id) + self.num_free_slots -= num_encoder_tokens + self.num_freeable_slots -= num_encoder_tokens def get_cached_input_ids(self, request: Request) -> set[int]: """Get all cached multimodal input IDs for a request. diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 956e23afa0d73..522b340b32aaf 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -182,7 +182,7 @@ class Scheduler(SchedulerInterface): token_budget = self.max_num_scheduled_tokens # Encoder-related. scheduled_encoder_inputs: dict[str, list[int]] = {} - encoder_budget = self.max_num_encoder_input_tokens + encoder_compute_budget = self.max_num_encoder_input_tokens # Spec decode-related. scheduled_spec_decode_tokens: dict[str, list[int]] = {} @@ -211,12 +211,13 @@ class Scheduler(SchedulerInterface): # Schedule encoder inputs. encoder_inputs_to_schedule = None - new_encoder_budget = encoder_budget + new_encoder_compute_budget = encoder_compute_budget if request.has_encoder_inputs: (encoder_inputs_to_schedule, num_new_tokens, - new_encoder_budget) = self._try_schedule_encoder_inputs( + new_encoder_compute_budget + ) = self._try_schedule_encoder_inputs( request, request.num_computed_tokens, num_new_tokens, - encoder_budget) + encoder_compute_budget) if num_new_tokens == 0: # The request cannot be scheduled because one of the following @@ -298,7 +299,7 @@ class Scheduler(SchedulerInterface): # Allocate the encoder cache. 
for i in encoder_inputs_to_schedule: self.encoder_cache_manager.allocate(request, i) - encoder_budget = new_encoder_budget + encoder_compute_budget = new_encoder_compute_budget # Record the LoRAs in scheduled_running_reqs scheduled_loras: set[int] = set() @@ -382,7 +383,7 @@ class Scheduler(SchedulerInterface): num_computed_tokens = request.num_computed_tokens encoder_inputs_to_schedule = None - new_encoder_budget = encoder_budget + new_encoder_compute_budget = encoder_compute_budget # KVTransfer: loading remote KV, do not allocate for new work. if load_kv_async: @@ -413,10 +414,10 @@ class Scheduler(SchedulerInterface): # Schedule encoder inputs. if request.has_encoder_inputs: (encoder_inputs_to_schedule, num_new_tokens, - new_encoder_budget + new_encoder_compute_budget ) = self._try_schedule_encoder_inputs( request, num_computed_tokens, num_new_tokens, - encoder_budget) + encoder_compute_budget) if num_new_tokens == 0: # The request cannot be scheduled. break @@ -495,7 +496,7 @@ class Scheduler(SchedulerInterface): # Allocate the encoder cache. for i in encoder_inputs_to_schedule: self.encoder_cache_manager.allocate(request, i) - encoder_budget = new_encoder_budget + encoder_compute_budget = new_encoder_compute_budget # Put back any skipped requests at the head of the waiting queue if skipped_waiting_requests: @@ -658,7 +659,7 @@ class Scheduler(SchedulerInterface): request: Request, num_computed_tokens: int, num_new_tokens: int, - encoder_budget: int, + encoder_compute_budget: int, ) -> tuple[list[int], int, int]: """ Determine which encoder inputs need to be scheduled in the current step, @@ -680,11 +681,17 @@ class Scheduler(SchedulerInterface): blocks and externally cached blocks (via KVConnector). 
""" if num_new_tokens == 0 or not request.has_encoder_inputs: - return [], num_new_tokens, encoder_budget + return [], num_new_tokens, encoder_compute_budget encoder_inputs_to_schedule: list[int] = [] mm_positions = request.mm_positions assert mm_positions is not None assert len(mm_positions) > 0 + + # NOTE: since scheduler operates on the request level (possibly with + # multiple encoder inputs per request), we need to create temporary + # trackers for accounting at the encoder input level. + mm_hashes_to_schedule = set() + num_tokens_to_schedule = 0 for i, pos_info in enumerate(mm_positions): start_pos = pos_info.offset num_encoder_tokens = pos_info.length @@ -695,13 +702,20 @@ class Scheduler(SchedulerInterface): if start_pos >= num_computed_tokens + num_new_tokens: # The encoder input is not needed in this step. break + if start_pos + num_encoder_tokens <= num_computed_tokens: # The encoder input is already computed and stored # in the decoder's KV cache. continue + # The same encoder input has already been scheduled in the current + # step. + if request.mm_hashes[i] in mm_hashes_to_schedule: + continue + if self.encoder_cache_manager.check_and_update_cache(request, i): - # The encoder input is already computed and cached. + # The encoder input is already computed and cached from a + # previous step. continue # If no encoder input chunking is allowed, we do not want to @@ -714,8 +728,9 @@ class Scheduler(SchedulerInterface): num_new_tokens = start_pos - num_computed_tokens break - if not self.encoder_cache_manager.try_allocate( - request, i, encoder_budget): + if not self.encoder_cache_manager.can_allocate( + request, i, encoder_compute_budget, + num_tokens_to_schedule): # The encoder cache is full or the encoder budget is exhausted. 
# NOTE(woosuk): We assume that the encoder input tokens should # be processed altogether, as the encoder usually uses @@ -732,9 +747,16 @@ class Scheduler(SchedulerInterface): num_new_tokens = 0 break - encoder_budget -= num_encoder_tokens + num_tokens_to_schedule += num_encoder_tokens + encoder_compute_budget -= num_encoder_tokens + mm_hashes_to_schedule.add(request.mm_hashes[i]) encoder_inputs_to_schedule.append(i) - return encoder_inputs_to_schedule, num_new_tokens, encoder_budget + + return ( + encoder_inputs_to_schedule, + num_new_tokens, + encoder_compute_budget, + ) def get_grammar_bitmask( self, From 50fede6634a997f4e971ecb4eb4cce337340e394 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Tue, 26 Aug 2025 18:00:18 +0800 Subject: [PATCH 023/112] [V1] Enable V1 for compute capability < 8.0 + FP32 (#23614) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/engine/arg_utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3ab1115f14462..f24c50ad73261 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1433,15 +1433,15 @@ class EngineArgs: recommend_to_remove=True) return False - # Need at least Ampere for now (FA support required). - # Skip this check if we are running on a non-GPU platform, - # or if the device capability is not available - # (e.g. in a Ray actor without GPUs). 
+ # Triton v3.3 has f16 conversion regression issue on Turing and Volta, + # which broke fp16 inference + # see: https://github.com/triton-lang/triton/issues/6698 if (current_platform.is_cuda() - and current_platform.get_device_capability() - and current_platform.get_device_capability().major < 8): - _raise_or_fallback(feature_name="Compute Capability < 8.0", - recommend_to_remove=False) + and not current_platform.has_device_capability(80) + and model_config.dtype == torch.float16): + _raise_or_fallback( + feature_name="Compute Capability < 8.0 with FP16", + recommend_to_remove=False) return False if self.kv_cache_dtype != "auto": From b00e69f8ca55f4a82847d39466f57ceb748324c1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 26 Aug 2025 11:27:20 +0100 Subject: [PATCH 024/112] Fix nits from #20059 (#23548) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/compilation.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index e2785e7602e45..56aa00a30d3ae 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -225,7 +225,8 @@ class CompilationConfig: # CudaGraph compilation cudagraph_mode: Optional[CUDAGraphMode] = None """ - The mode of the cudagraph. + The mode of the cudagraph: + - NONE, no cudagraph capture. - PIECEWISE. (v1 default) - FULL. 
@@ -384,13 +385,10 @@ class CompilationConfig: if pass_config_exclude: exclude["pass_config"] = pass_config_exclude - # The cast to string is necessary because Pydantic is mocked in docs - # builds and sphinx-argparse doesn't know the return type of decode() - return str( - TypeAdapter(CompilationConfig).dump_json( - self, - exclude=exclude, # type: ignore[arg-type] - exclude_unset=True).decode()) + return TypeAdapter(CompilationConfig).dump_json( + self, + exclude=exclude, # type: ignore[arg-type] + exclude_unset=True).decode() __str__ = __repr__ From 6ace2f72b03fe41475d7d64e2bfd40b79c447f5b Mon Sep 17 00:00:00 2001 From: Huy Do <huydhn@gmail.com> Date: Tue, 26 Aug 2025 04:16:09 -0700 Subject: [PATCH 025/112] Fix writing benchmark results with tuple keys (#23633) Signed-off-by: Huy Do <huydhn@gmail.com> --- vllm/benchmarks/lib/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/lib/utils.py b/vllm/benchmarks/lib/utils.py index 5f95fdcc75829..0c27687dcf16d 100644 --- a/vllm/benchmarks/lib/utils.py +++ b/vllm/benchmarks/lib/utils.py @@ -54,7 +54,12 @@ class InfEncoder(json.JSONEncoder): def clear_inf(self, o: Any): if isinstance(o, dict): - return {k: self.clear_inf(v) for k, v in o.items()} + return { + str(k) + if not isinstance(k, (str, int, float, bool, type(None))) + else k: self.clear_inf(v) + for k, v in o.items() + } elif isinstance(o, list): return [self.clear_inf(v) for v in o] elif isinstance(o, float) and math.isinf(o): From d52358c1e07768266e3db92e847cd28af87ca4b9 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Tue, 26 Aug 2025 07:16:33 -0400 Subject: [PATCH 026/112] [Perf] Remove duplicated NVFP4 blockscales to save memory (#23379) Signed-off-by: mgoin <mgoin64@gmail.com> --- .../compressed_tensors_moe.py | 20 +++++------ .../schemes/compressed_tensors_w4a4_nvfp4.py | 11 +++--- .../layers/quantization/modelopt.py | 34 ++++++++----------- 3 files changed, 30 insertions(+), 35 
deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 7bc35cd81ac3f..1ee3478aa4f43 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -246,13 +246,13 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): return # swizzle weight scales - layer.w13_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale( + layer.w13_weight_scale = torch.nn.Parameter(swizzle_blockscale( layer.w13_weight_scale), - requires_grad=False) + requires_grad=False) - layer.w2_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale( + layer.w2_weight_scale = torch.nn.Parameter(swizzle_blockscale( layer.w2_weight_scale), - requires_grad=False) + requires_grad=False) # w13 w13_input_global_scale = layer.w13_input_global_scale.max( @@ -383,8 +383,8 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_blockscale_swizzled, - w2_scale=layer.w2_blockscale_swizzled, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, apply_router_weight_on_input=apply_router_weight_on_input, ) @@ -406,8 +406,8 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_blockscale_swizzled, - w2_scale=layer.w2_blockscale_swizzled, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, g1_alphas=layer.g1_alphas, g2_alphas=layer.g2_alphas, a1_gscale=layer.w13_input_scale_quant, @@ -427,8 +427,8 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): a=x, w1_fp4=layer.w13_weight, w2_fp4=layer.w2_weight, - 
w1_blockscale=layer.w13_blockscale_swizzled, - w2_blockscale=layer.w2_blockscale_swizzled, + w1_blockscale=layer.w13_weight_scale, + w2_blockscale=layer.w2_weight_scale, g1_alphas=layer.g1_alphas, g2_alphas=layer.g2_alphas, a1_gscale=layer.w13_input_scale_quant, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py index 49d76bbeaa3a1..dedd681f15ded 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py @@ -112,13 +112,12 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme): torch.uint8), epilogue_tile_m).reshape( weight_scale.shape).view(torch.float8_e4m3fn)) - layer.weight_scale_swizzled = Parameter(weight_scale, - requires_grad=False) + layer.weight_scale = Parameter(weight_scale, requires_grad=False) layer.weight_packed = Parameter(weight, requires_grad=False) else: swizzled_weight_scale = swizzle_blockscale(layer.weight_scale) - layer.weight_scale_swizzled = Parameter(swizzled_weight_scale, - requires_grad=False) + layer.weight_scale = Parameter(swizzled_weight_scale, + requires_grad=False) layer.weight_packed = Parameter(layer.weight_packed.data, requires_grad=False) @@ -136,7 +135,7 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme): x=x, input_global_scale=layer.input_global_scale, weight=layer.weight_packed, - weight_scale_swizzled=layer.weight_scale_swizzled, + weight_scale_swizzled=layer.weight_scale, weight_global_scale=layer.weight_global_scale) if bias is not None: out = out + bias @@ -149,7 +148,7 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme): x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale) mm_args = (x_fp4, layer.weight_packed, x_blockscale, - layer.weight_scale_swizzled, layer.alpha, 
output_dtype) + layer.weight_scale, layer.alpha, output_dtype) if self.backend == "flashinfer-trtllm": out = flashinfer_scaled_fp4_mm(*mm_args, backend="trtllm") elif self.backend == "flashinfer-cutlass": diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 046234057f04a..72864853f7e0c 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -907,20 +907,18 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): torch.uint8), epilogue_tile_m).reshape( weight_scale.shape).view(torch.float8_e4m3fn)) - layer.weight_scale_swizzled = Parameter(weight_scale, - requires_grad=False) + layer.weight_scale = Parameter(weight_scale, requires_grad=False) layer.weight = Parameter(weight, requires_grad=False) else: swizzled_weight_scale = swizzle_blockscale(layer.weight_scale) - layer.weight_scale_swizzled = Parameter(swizzled_weight_scale, - requires_grad=False) + layer.weight_scale = Parameter(swizzled_weight_scale, + requires_grad=False) layer.weight = Parameter(layer.weight.data, requires_grad=False) if self.backend == "marlin": prepare_fp4_layer_for_marlin(layer) del layer.alpha del layer.input_scale - del layer.weight_scale_swizzled def apply( self, @@ -951,14 +949,14 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): assert (x_fp4.dtype == torch.uint8) assert (layer.weight.dtype == torch.uint8) assert (x_blockscale.dtype == torch.float8_e4m3fn) - assert (layer.weight_scale_swizzled.dtype == torch.float8_e4m3fn) + assert (layer.weight_scale.dtype == torch.float8_e4m3fn) assert (layer.alpha.dtype == torch.float32) mm_args = ( x_fp4, layer.weight, x_blockscale, - layer.weight_scale_swizzled, + layer.weight_scale, layer.alpha, output_dtype, ) @@ -1320,16 +1318,16 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): "Weight Blockscale must be represented as FP8-E4M3") w13_blockscale_swizzled = swizzle_blockscale( layer.w13_weight_scale) - 
layer.w13_blockscale_swizzled = Parameter(w13_blockscale_swizzled, - requires_grad=False) + layer.w13_weight_scale = Parameter(w13_blockscale_swizzled, + requires_grad=False) assert (layer.w2_weight_scale.shape[2] % 16 == 0), ( "Expected weight_scale.dim(1) to be divisible by 16") assert (layer.w2_weight_scale.dtype == torch.float8_e4m3fn), ( "Weight Blockscale must be represented as FP8-E4M3") w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale) - layer.w2_blockscale_swizzled = Parameter(w2_blockscale_swizzled, - requires_grad=False) + layer.w2_weight_scale = Parameter(w2_blockscale_swizzled, + requires_grad=False) layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False) @@ -1339,8 +1337,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): del layer.g2_alphas del layer.w13_input_scale_quant del layer.w2_input_scale_quant - del layer.w13_blockscale_swizzled - del layer.w2_blockscale_swizzled def apply( self, @@ -1474,8 +1470,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_blockscale_swizzled, - w2_scale=layer.w2_blockscale_swizzled, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, apply_router_weight_on_input=apply_router_weight_on_input, ) elif (self.allow_flashinfer @@ -1489,8 +1485,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): w2=layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - w1_scale=layer.w13_blockscale_swizzled, - w2_scale=layer.w2_blockscale_swizzled, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, g1_alphas=layer.g1_alphas, g2_alphas=layer.g2_alphas, a1_gscale=layer.w13_input_scale_quant, @@ -1510,8 +1506,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): a=x, w1_fp4=layer.w13_weight, w2_fp4=layer.w2_weight, - w1_blockscale=layer.w13_blockscale_swizzled, - w2_blockscale=layer.w2_blockscale_swizzled, + w1_blockscale=layer.w13_weight_scale, + 
w2_blockscale=layer.w2_weight_scale, g1_alphas=layer.g1_alphas, g2_alphas=layer.g2_alphas, a1_gscale=layer.w13_input_scale_quant, From fdeb3dac132c9ef92d981dd811529e6496781b07 Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Tue, 26 Aug 2025 20:09:47 +0800 Subject: [PATCH 027/112] [Model] fix DeepSeek e_score_correction_bias dtype to fp32 (#23640) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- vllm/model_executor/models/deepseek_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index d56224b4b7b30..7657e7cb003d6 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -126,7 +126,7 @@ class DeepseekV2MoE(nn.Module): prefix=f"{prefix}.gate") if config.topk_method == "noaux_tc": self.gate.e_score_correction_bias = nn.Parameter( - torch.empty(config.n_routed_experts)) + torch.empty(config.n_routed_experts, dtype=torch.float32)) else: self.gate.e_score_correction_bias = None From 384dd1b0a899c6761010b42aefe1159c8062f0a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C3=BA=C5=A1=20N=C3=A1me=C5=A1n=C3=BD?= <matus@namesny.com> Date: Tue, 26 Aug 2025 14:13:15 +0200 Subject: [PATCH 028/112] [Bugfix] Add missing enable_log_outputs parameter to init_app_state function (#23634) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Matúš Námešný <matus.namesny@ameria.com> --- vllm/entrypoints/openai/api_server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 14ba8aa641837..db02767fdfd71 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1748,6 +1748,7 @@ async def init_app_state( reasoning_parser=args.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, 
enable_force_include_usage=args.enable_force_include_usage, + enable_log_outputs=args.enable_log_outputs, ) if "generate" in supported_tasks else None state.openai_serving_chat = OpenAIServingChat( engine_client, @@ -1765,6 +1766,7 @@ async def init_app_state( reasoning_parser=args.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, + enable_log_outputs=args.enable_log_outputs, ) if "generate" in supported_tasks else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, From ebd5a77bb5a6b7643f047f61294da0ce92baf3f6 Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes <gcalmettes@scaleway.com> Date: Tue, 26 Aug 2025 14:26:26 +0200 Subject: [PATCH 029/112] feat: add usage to TranscriptionResponse (text and json response_format) (#23576) Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com> --- .../openai/test_transcription_validation.py | 14 ++++++++++---- vllm/entrypoints/openai/protocol.py | 6 ++++++ vllm/entrypoints/openai/speech_to_text.py | 17 ++++++++++++++++- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 93239f41a4aeb..6009d9aeec935 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -69,8 +69,11 @@ async def test_basic_audio(mary_had_lamb, model_name): language="en", response_format="text", temperature=0.0) - out = json.loads(transcription)['text'] - assert "Mary had a little lamb," in out + out = json.loads(transcription) + out_text = out['text'] + out_usage = out['usage'] + assert "Mary had a little lamb," in out_text + assert out_usage["seconds"] == 16, out_usage["seconds"] @pytest.mark.asyncio @@ -116,9 +119,12 @@ async def test_long_audio_request(mary_had_lamb, client): language="en", response_format="text", 
temperature=0.0) - out = json.loads(transcription)['text'] - counts = out.count("Mary had a little lamb") + out = json.loads(transcription) + out_text = out['text'] + out_usage = out['usage'] + counts = out_text.count("Mary had a little lamb") assert counts == 10, counts + assert out_usage["seconds"] == 161, out_usage["seconds"] @pytest.mark.asyncio diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index a3d7b78cf4552..5cb41bd93d4bc 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -2232,9 +2232,15 @@ class TranscriptionRequest(OpenAIBaseModel): # Transcription response objects +class TranscriptionUsageAudio(OpenAIBaseModel): + type: Literal["duration"] = "duration" + seconds: int + + class TranscriptionResponse(OpenAIBaseModel): text: str """The transcribed text.""" + usage: TranscriptionUsageAudio class TranscriptionWord(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index 01140a4bfea7e..de2619a78f8e0 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -200,7 +200,22 @@ class OpenAISpeechToText(OpenAIServing): for result_generator in list_result_generator: async for op in result_generator: text += op.outputs[0].text - return cast(T, response_class(text=text)) + + if self.task_type == "transcribe": + # add usage in TranscriptionResponse. 
+ usage = { + "type": "duration", + # rounded up as per openAI specs + "seconds": int(math.ceil(duration_s)), + } + final_response = cast(T, response_class(text=text, + usage=usage)) + else: + # no usage in response for translation task + final_response = cast( + T, response_class(text=text)) # type: ignore[call-arg] + + return final_response except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: From 2b4fc9bd9b8321265ff54065ea47bd9e327c6b6f Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Tue, 26 Aug 2025 05:41:52 -0700 Subject: [PATCH 030/112] Support FlashAttention Backend for Hybrid SSM Models (#23299) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- .../models/language/generation/test_hybrid.py | 3 -- vllm/v1/worker/gpu_model_runner.py | 41 ++++++++----------- 2 files changed, 17 insertions(+), 27 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 2055c44c83cda..7e7cc893ec8aa 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -110,9 +110,6 @@ def test_models( if model in V1_SUPPORTED_MODELS: with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - if model in HYBRID_MODELS: - # required due to reorder_batch behaviour - m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, enable_prefix_caching=False) as vllm_model: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4f6cf9a350706..14f2305dadc54 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3023,40 +3023,33 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): raise NotImplementedError if has_attn and has_mamba: - self._verify_hybrid_attention_mamba_layout(kv_cache_config, - kv_cache_raw_tensors) + 
self._update_hybrid_attention_mamba_layout(kv_caches) return kv_caches - def _verify_hybrid_attention_mamba_layout( - self, kv_cache_config: KVCacheConfig, - kv_cache_raw_tensors: dict[str, torch.Tensor]) -> None: + def _update_hybrid_attention_mamba_layout( + self, kv_caches: dict[str, torch.Tensor]) -> None: """ - Verify that the KV cache memory layout is compatible for - models with both attention and mamba KV cache groups. + Update the layout of attention layers from (2, num_blocks, ...) to + (num_blocks, 2, ...). Args: - kv_cache_config: The KV cache config - kv_cache_raw_tensors: The KV cache buffer of each layer. + kv_caches: The KV cache buffer of each layer. """ for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator(): for layer_name in group.layer_names: - raw_tensor = kv_cache_raw_tensors[layer_name] - num_blocks = (raw_tensor.numel() // - kv_cache_spec.page_size_bytes) - if isinstance(kv_cache_spec, AttentionSpec): - - kv_cache_shape = group.backend.get_kv_cache_shape( - num_blocks, kv_cache_spec.block_size, - kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) - if kv_cache_shape[0] != num_blocks or kv_cache_shape[ - 1] != 2: - raise ValueError( - "Hybrid models in V1 require an attention " - "backend with kv_cache_shape=" - "(num_blocks, 2, ...). Please try setting " - "VLLM_ATTENTION_BACKEND=FLASHINFER") + kv_cache = kv_caches[layer_name] + if (isinstance(kv_cache_spec, AttentionSpec) + and kv_cache.shape[0] == 2): + assert kv_cache.shape[1] != 2, \ + "Fail to determine whether the layout is " \ + "(2, num_blocks, ...) or (num_blocks, 2, ...) 
for " \ + f"a tensor of shape {kv_cache.shape}" + hidden_size = kv_cache.shape[2:].numel() + kv_cache.as_strided_(size=kv_cache.shape, + stride=(hidden_size, 2 * hidden_size, + *kv_cache.stride()[2:])) def initialize_kv_cache_tensors( self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]: From 164b2273c87ad72b2d3b1f2762367de42d6e946b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 26 Aug 2025 14:00:18 +0100 Subject: [PATCH 031/112] [Docs] Fix broken links to `docs/api/summary.md` (#23637) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/examples/README.md | 6 +++--- docs/models/generative_models.md | 2 +- docs/models/pooling_models.md | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/examples/README.md b/docs/examples/README.md index 34e4dfd408a20..3cf93027f4209 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -2,6 +2,6 @@ vLLM's examples are split into three categories: -- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference/) -- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving/) -- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others/) +- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference) +- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving) +- For examples of using some of vLLM's advanced features (e.g. 
LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others) diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index a64ecd31ebaef..d02522a6657de 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -19,7 +19,7 @@ Run a model in generation mode via the option `--runner generate`. ## Offline Inference The [LLM][vllm.LLM] class provides various methods for offline inference. -See [configuration](../api/summary.md#configuration) for a list of options when initializing the model. +See [configuration](../api/README.md#configuration) for a list of options when initializing the model. ### `LLM.generate` diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 753d8bd0b8339..fbb5f6f6dd171 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -81,7 +81,7 @@ which takes priority over both the model's and Sentence Transformers's defaults. ## Offline Inference The [LLM][vllm.LLM] class provides various methods for offline inference. -See [configuration](../api/summary.md#configuration) for a list of options when initializing the model. +See [configuration](../api/README.md#configuration) for a list of options when initializing the model. 
### `LLM.embed` From b78bed1bc5debead116092f429eee51398691fc8 Mon Sep 17 00:00:00 2001 From: En Ouyang <en.ouyang93@outlook.com> Date: Tue, 26 Aug 2025 21:04:25 +0800 Subject: [PATCH 032/112] [Hardware][Mac] Fix the installation fail for Apple Silicon (CPU) (#23565) Signed-off-by: oye93 <en.ouyang93@outlook.com> Co-authored-by: Li, Jiang <jiang1.li@intel.com> --- cmake/cpu_extension.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index cc38cd41a5b24..52bfd82c7fcfe 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -1,6 +1,7 @@ include(FetchContent) set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_EXTENSIONS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) From f66673a39d9f364194c249f28098cad8a5584ccb Mon Sep 17 00:00:00 2001 From: nvjullin <jullin@nvidia.com> Date: Tue, 26 Aug 2025 21:54:04 +0800 Subject: [PATCH 033/112] [Kernel] Added flashinfer fp8 per-tensor gemms (#22895) Signed-off-by: Julien Lin <jullin@nvidia.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> --- .buildkite/test-pipeline.yaml | 1 + tests/compile/test_fusion.py | 15 ++-- tests/compile/test_sequence_parallelism.py | 3 +- tests/compile/test_silu_mul_quant_fusion.py | 13 ++-- .../quantization/test_flashinfer_scaled_mm.py | 73 +++++++++++++++++++ .../model_executor/layers/quantization/fp8.py | 5 +- .../layers/quantization/ptpc_fp8.py | 4 +- .../layers/quantization/utils/w8a8_utils.py | 59 +++++++++++---- vllm/utils/flashinfer.py | 61 ++++++++++++++++ 9 files changed, 198 insertions(+), 36 deletions(-) create mode 100644 tests/kernels/quantization/test_flashinfer_scaled_mm.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 1ccfa93c571ce..0d3b7a294d963 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -655,6 +655,7 @@ steps: - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - pytest -v -s 
tests/kernels/quantization/test_nvfp4_quant.py - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 5cfad935a0fb1..c4229f93464ac 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -15,7 +15,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, QuantKey, ScaleDesc) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - CUTLASS_FP8_SUPPORTED, Fp8LinearOp, maybe_create_device_identity) + Fp8LinearOp, maybe_create_device_identity) from vllm.platforms import current_platform from .backend import TestBackend @@ -26,9 +26,9 @@ FP8_DTYPE = current_platform.fp8_dtype() class TestModel(torch.nn.Module): def __init__(self, hidden_size: int, eps: float, static: bool, - cutlass_fp8_enabled: bool, *args, **kwargs): + force_fp8_e4m3fnuz: bool, *args, **kwargs): super().__init__(*args, **kwargs) - self.cutlass_fp8_enabled = cutlass_fp8_enabled + self.force_fp8_e4m3fnuz = force_fp8_e4m3fnuz self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)] self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN @@ -43,7 +43,7 @@ class TestModel(torch.nn.Module): for _ in range(2) ] self.fp8_linear = Fp8LinearOp( - cutlass_fp8_supported=cutlass_fp8_enabled, + force_fp8_e4m3fnuz=force_fp8_e4m3fnuz, act_quant_static=static, act_quant_group_shape=group_shape, ) @@ -81,12 +81,11 @@ class TestModel(torch.nn.Module): @pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049]) @pytest.mark.parametrize("eps", [1e-5, 1e-6]) 
@pytest.mark.parametrize("static", [True, False]) -@pytest.mark.parametrize("cutlass_fp8_enabled", - [True, False] if CUTLASS_FP8_SUPPORTED else [False]) +@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm") def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, - cutlass_fp8_enabled): + force_fp8_e4m3fnuz): torch.set_default_device("cuda") torch.set_default_dtype(dtype) torch.manual_seed(1) @@ -103,7 +102,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, fusion_pass = FusionPass.instance(vllm_config) backend = TestBackend(noop_pass, fusion_pass) - model = TestModel(hidden_size, eps, static, cutlass_fp8_enabled) + model = TestModel(hidden_size, eps, static, force_fp8_e4m3fnuz) # First dimension dynamic x = torch.rand(num_tokens, hidden_size) diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py index a6baa97fe6990..fb9f9dde22799 100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/test_sequence_parallelism.py @@ -104,8 +104,7 @@ class TestQuantModel(torch.nn.Module): # Initialize weights torch.nn.init.normal_(self.gate_proj, std=0.02) - self.fp8_linear = Fp8LinearOp(cutlass_fp8_supported=True, - use_per_token_if_dynamic=False) + self.fp8_linear = Fp8LinearOp(use_per_token_if_dynamic=False) self.scale = torch.rand(1, dtype=torch.float32) # Create a weight that is compatible with torch._scaled_mm, diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index 5351a3cf35ba5..0e1059e654479 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -12,7 +12,7 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape) from 
vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - CUTLASS_FP8_SUPPORTED, Fp8LinearOp) + Fp8LinearOp) from vllm.platforms import current_platform from .backend import TestBackend @@ -20,7 +20,7 @@ from .backend import TestBackend class TestModel(torch.nn.Module): - def __init__(self, hidden_size: int, cutlass_fp8_enabled: bool, *args, + def __init__(self, hidden_size: int, force_fp8_e4m3fnuz: bool, *args, **kwargs): super().__init__(*args, **kwargs) self.silu_and_mul = SiluAndMul() @@ -32,7 +32,7 @@ class TestModel(torch.nn.Module): hidden_size).to(dtype=current_platform.fp8_dtype()).t()) self.fp8_linear = Fp8LinearOp( - cutlass_fp8_supported=cutlass_fp8_enabled, + force_fp8_e4m3fnuz=force_fp8_e4m3fnuz, act_quant_static=True, act_quant_group_shape=GroupShape.PER_TENSOR, ) @@ -48,12 +48,11 @@ class TestModel(torch.nn.Module): @pytest.mark.parametrize("num_tokens", [256]) @pytest.mark.parametrize("hidden_size", [64]) -@pytest.mark.parametrize("cutlass_fp8_enabled", - [True, False] if CUTLASS_FP8_SUPPORTED else [False]) +@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm") def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, - cutlass_fp8_enabled): + force_fp8_e4m3fnuz): torch.set_default_device("cuda") torch.set_default_dtype(torch.float16) @@ -64,7 +63,7 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, fusion_pass = ActivationQuantFusionPass(config) backend = TestBackend(NoOpEliminationPass(config), fusion_pass) - model = TestModel(hidden_size, cutlass_fp8_enabled) + model = TestModel(hidden_size, force_fp8_e4m3fnuz) # First dimension dynamic x = torch.rand(num_tokens, hidden_size * 2) diff --git a/tests/kernels/quantization/test_flashinfer_scaled_mm.py b/tests/kernels/quantization/test_flashinfer_scaled_mm.py new file mode 100644 index 0000000000000..9f669c6df8bd5 --- /dev/null +++ 
b/tests/kernels/quantization/test_flashinfer_scaled_mm.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + +from vllm import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm + +if not current_platform.has_device_capability(100): + pytest.skip( + reason= + "Flashinfer FP8 gemms requires compute capability of 10.0 or above.", + allow_module_level=True, + ) + +DTYPES = [torch.float16, torch.bfloat16] +# m, n, k +SHAPES = [(128, 128, 64), (128, 128, 128), (256, 128, 64), (128, 256, 128)] +PAD_SHAPES = [(150, 128, 64), (128, 128, 96)] +SHAPES.extend(PAD_SHAPES) + +SEEDS = [42] +CUDA_DEVICES = ["cuda:0"] + + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("use_bias", [True, False]) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("autotune", [False, True]) +@torch.inference_mode() +def test_flashinfer_fp8_gemm( + dtype: torch.dtype, + shape: tuple[int, int, int], + use_bias: bool, + seed: int, + device: str, + autotune: bool, +) -> None: + current_platform.seed_everything(seed) + m, n, k = shape + a = torch.randn((m, k), dtype=dtype, device=device) + b = torch.randn((n, k), dtype=dtype, device=device) / k + + a_fp8, a_scale = ops.scaled_fp8_quant(a) + b_fp8, b_scale = ops.scaled_fp8_quant(b) + + expected_out = torch.mm( + a_scale * a_fp8.to(dtype=torch.float32), + b_scale * b_fp8.to(dtype=torch.float32).t(), + ).to(dtype=dtype) + + if use_bias: + bias = torch.randn((n, ), dtype=dtype, device=device) + expected_out = expected_out + bias + else: + bias = None + + import flashinfer + + with flashinfer.autotune(autotune): + out = flashinfer_scaled_fp8_mm( + a_fp8, + b_fp8.t(), + a_scale, + b_scale, + dtype, + bias=bias, + ) + + torch.testing.assert_close(out, 
expected_out, atol=1e-2, rtol=1e-2) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index a4de4d7094c30..d45d368b582df 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -223,8 +223,7 @@ class Fp8LinearMethod(LinearMethodBase): self.fp8_linear = Fp8LinearOp( act_quant_static=self.act_q_static, - act_quant_group_shape=self.act_q_group_shape, - cutlass_fp8_supported=cutlass_fp8_supported()) + act_quant_group_shape=self.act_q_group_shape) def create_weights( self, @@ -376,6 +375,8 @@ class Fp8LinearMethod(LinearMethodBase): # Update the layer with the new values. layer.weight = Parameter(qweight.t(), requires_grad=False) layer.weight_scale = Parameter(weight_scale, requires_grad=False) + # layer.input_scale is None indicates dynamic quant and scale is + # computed from input. layer.input_scale = None # If checkpoint is fp8, handle that there are N scales for N diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py index d11cba2caba88..466fd5fba7685 100644 --- a/vllm/model_executor/layers/quantization/ptpc_fp8.py +++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py @@ -97,8 +97,8 @@ class PTPCFp8LinearMethod(Fp8LinearMethod): self.quant_config.is_checkpoint_fp8_serialized = False self.fp8_linear = Fp8LinearOp( act_quant_static=False, - cutlass_fp8_supported=False, - act_quant_group_shape=GroupShape.PER_TOKEN) + act_quant_group_shape=GroupShape.PER_TOKEN, + force_fp8_e4m3fnuz=True) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.weight = torch.nn.Parameter(layer.weight.data, diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 36d16960ec57c..5333bbd310ff9 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ 
b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -14,6 +14,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape) from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op +from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm, has_flashinfer # Input scaling factors are no longer optional in _scaled_mm starting # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale @@ -157,6 +158,19 @@ def cutlass_w8a8_scaled_mm(*, qinput: torch.Tensor, weight: torch.Tensor, return output.view(*output_shape) +def flashinfer_w8a8_scaled_mm(*, qinput: torch.Tensor, weight: torch.Tensor, + out_dtype: torch.dtype, scale_a: torch.Tensor, + scale_b: torch.Tensor, bias: torch.Tensor, + output_shape: list, **kwargs) -> torch.Tensor: + + return flashinfer_scaled_fp8_mm(qinput, + weight, + out_dtype=out_dtype, + scale_a=scale_a, + scale_b=scale_b, + bias=bias) + + def rocm_per_tensor_w8a8_scaled_mm_impl( qinput: torch.Tensor, weight: torch.Tensor, out_dtype: torch.dtype, scale_a: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor, @@ -231,8 +245,8 @@ def torch_per_token_w8a8_scaled_mm(*, qinput: torch.Tensor, out_dtype: torch.dtype, scale_a: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor, - input_2d: torch.Tensor, - output_shape: list) -> torch.Tensor: + input_2d: torch.Tensor, output_shape: list, + **kwargs) -> torch.Tensor: # Note: Callers of this function should check USE_ROWWISE_TORCH_SCALED_MM # when using it. # For now it has only been validated on ROCm platform. 
@@ -303,16 +317,22 @@ def torch_channelwise_w8a8_scaled_mm(*, qinput: torch.Tensor, def dispatch_w8a8_scaled_mm( - cutlass_fp8_supported: bool, per_tensor_weights: bool, + preferred_backend: str, per_tensor_weights: bool, per_tensor_activations: bool) -> Callable[..., torch.Tensor]: - # cutlass_scaled_mm supports per tensor/channel W and per tensor/token A - if cutlass_fp8_supported: - return cutlass_w8a8_scaled_mm if per_tensor_weights and per_tensor_activations: - if current_platform.is_rocm(): + if preferred_backend == "rocm": return rocm_per_tensor_w8a8_scaled_mm + if preferred_backend == "flashinfer": + return flashinfer_w8a8_scaled_mm + if preferred_backend == "cutlass": + return cutlass_w8a8_scaled_mm return torch_per_tensor_w8a8_scaled_mm + + # cutlass_scaled_mm supports per tensor/channel W and per tensor/token A + if preferred_backend == "cutlass" or preferred_backend == "flashinfer": + return cutlass_w8a8_scaled_mm + # If torch.scaled_mm supports per-channel (weights) per-token (inputs) if not per_tensor_weights and not per_tensor_activations \ and USE_ROWWISE_TORCH_SCALED_MM: @@ -334,10 +354,20 @@ class Fp8LinearOp: def __init__(self, act_quant_static: bool, - cutlass_fp8_supported: bool = cutlass_fp8_supported(), act_quant_group_shape: GroupShape = GroupShape.PER_TENSOR, - pad_output: Optional[bool] = None): - self.cutlass_fp8_supported = cutlass_fp8_supported + pad_output: Optional[bool] = None, + force_fp8_e4m3fnuz: bool = False): + if current_platform.is_rocm(): + self.preferred_backend = "rocm" + elif current_platform.is_cuda( + ) and not force_fp8_e4m3fnuz and cutlass_fp8_supported(): + if has_flashinfer() and current_platform.has_device_capability( + 100): + self.preferred_backend = "flashinfer" + else: + self.preferred_backend = "cutlass" + else: + self.preferred_backend = "torch" # Note: we pad the input because torch._scaled_mm is more performant # for matrices with batch dimension > 16. 
@@ -347,8 +377,7 @@ class Fp8LinearOp: if pad_output is None: config = get_current_vllm_config().compilation_config pad_output = config.level < CompilationLevel.PIECEWISE and \ - not cutlass_fp8_supported and \ - not current_platform.is_rocm() + self.preferred_backend == "torch" self.output_padding = 17 if pad_output else None self.act_quant_static = act_quant_static @@ -393,9 +422,9 @@ class Fp8LinearOp: per_tensor_activations = (x_scale.numel() == 1) # TODO(luka) do this dispatch during init (after ScaledMM refactor) - w8a8_scaled_mm_func = dispatch_w8a8_scaled_mm( - self.cutlass_fp8_supported, per_tensor_weights, - per_tensor_activations) + w8a8_scaled_mm_func = dispatch_w8a8_scaled_mm(self.preferred_backend, + per_tensor_weights, + per_tensor_activations) return w8a8_scaled_mm_func(qinput=qinput, weight=weight, diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 5dd239c50f637..fab134733d4fd 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -265,6 +265,37 @@ if has_flashinfer(): dtype=dtype, device=A.device) + @torch.library.custom_op( + "vllm::bmm_fp8", + mutates_args=[], + device_types="cuda", + ) + def bmm_fp8( + A: torch.Tensor, + B: torch.Tensor, + A_scale: torch.Tensor, + B_scale: torch.Tensor, + dtype: torch.dtype, + backend: str, + ) -> torch.Tensor: + from flashinfer import bmm_fp8 as bmm_fp8_ + return bmm_fp8_(A, B, A_scale, B_scale, dtype, None, backend) + + @torch.library.register_fake("vllm::bmm_fp8", ) + def bmm_fp8_fake( + A: torch.Tensor, + B: torch.Tensor, + A_scale: torch.Tensor, + B_scale: torch.Tensor, + dtype: torch.dtype, + backend: str, + ) -> torch.Tensor: + return torch.empty(A.shape[0], + A.shape[1], + B.shape[2], + dtype=dtype, + device=A.device) + def flashinfer_scaled_fp4_mm(a: torch.Tensor, b: torch.Tensor, block_scale_a: torch.Tensor, @@ -293,6 +324,35 @@ def flashinfer_scaled_fp4_mm(a: torch.Tensor, b: torch.Tensor, ) +def flashinfer_scaled_fp8_mm( + a: torch.Tensor, + b: torch.Tensor, + 
scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: torch.dtype, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + assert a.ndim == 2 and b.ndim == 2 + assert a.shape[1] == b.shape[0] + assert scale_a.numel() == 1 and scale_b.numel() == 1 + assert a.dtype == torch.float8_e4m3fn and b.dtype == torch.float8_e4m3fn + assert a.device.type == "cuda" and b.device.type == "cuda" + assert scale_a.dtype == torch.float32 and scale_b.dtype == torch.float32 + assert scale_a.device.type == "cuda" and scale_b.device.type == "cuda" + + output = bmm_fp8( + a.unsqueeze(0), + b.unsqueeze(0), + scale_a, + scale_b, + out_dtype, + "auto", + ).view(a.shape[0], b.shape[1]) + + if bias is not None: + output = output + bias + return output + + __all__ = [ "has_flashinfer", "flashinfer_trtllm_fp8_block_scale_moe", @@ -307,4 +367,5 @@ __all__ = [ "supports_trtllm_attention", "use_trtllm_attention", "flashinfer_scaled_fp4_mm", + "flashinfer_scaled_fp8_mm", ] From 7c04779afa7d0811dba3e1ec98c0ac1bc56570be Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Tue, 26 Aug 2025 16:05:29 +0200 Subject: [PATCH 034/112] [Doc]: fix various spelling issues in multiple files (#23636) Signed-off-by: Didier Durand <durand.didier@gmail.com> --- .buildkite/nightly-benchmarks/README.md | 2 +- benchmarks/README.md | 2 +- docs/configuration/optimization.md | 4 ++-- docs/configuration/tpu.md | 2 +- docs/design/fused_moe_modular_kernel.md | 6 +++--- vllm/distributed/kv_transfer/README.md | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index b39f9899a8f28..e6f5c8b60f459 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -141,7 +141,7 @@ When run, benchmark script generates results under `benchmark/results` folder, a `compare-json-results.py` compares two `benchmark_results.json` files and 
provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT. If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead. -Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output lenght, max concurrency and qps. +Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps. `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` | | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio | diff --git a/benchmarks/README.md b/benchmarks/README.md index a2dd5bb58325c..38072152b653b 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -749,7 +749,7 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct \ Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `random-output-len`. -Ex.1: Fixed number of items and a single image resolutionm, enforcing generation of approx 40 tokens: +Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens: ```bash vllm bench serve \ diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 6c7c31f503c15..bb47e1b90f086 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -168,7 +168,7 @@ llm = LLM( Batch-level DP is not to be confused with API request-level DP (which is instead controlled by `data_parallel_size`). -The availablilty of batch-level DP is based on model implementation. +The availability of batch-level DP is based on model implementation. 
Currently, the following models support `mm_encoder_tp_mode="data"`: - Llama4 (<gh-pr:18368>) @@ -205,7 +205,7 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2 !!! note [Multi-modal processor cache](#processor-cache) is disabled when API server scale-out is enabled - because it requires a one-to-one correspondance between API and engine core processes. + because it requires a one-to-one correspondence between API and engine core processes. ## Multi-Modal Caching diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md index a93435ed71b50..ac2b6baffd14e 100644 --- a/docs/configuration/tpu.md +++ b/docs/configuration/tpu.md @@ -70,7 +70,7 @@ For example, max_model_len=512, padding_gap=64, the buckets will be [16, 32, 64, The fewer tokens we pad, the less unnecessary computation TPU does, the better performance we can get. For example, if num_tokens=300, with exponential padding, we pad to 512, with the bucket_padding above, we pad to 320. -However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compilaed graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding. +However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compiled graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding. 
#### Quantization diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md index 3c4c7d2102170..202e9c1caf113 100644 --- a/docs/design/fused_moe_modular_kernel.md +++ b/docs/design/fused_moe_modular_kernel.md @@ -133,7 +133,7 @@ class FusedMoEModularKernel: Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & Combine implementation / kernel. For example, * PplxPrepareAndFinalize type is backed by Pplx All2All kernels, -* DeepEPHTPrepareAndFinalize type is backed by DeepEP High-Throughtput All2All kernels, and +* DeepEPHTPrepareAndFinalize type is backed by DeepEP High-Throughput All2All kernels, and * DeepEPLLPrepareAndFinalize type is backed by DeepEP Low-Latency All2All kernels. #### Step 1: Add an All2All manager @@ -183,7 +183,7 @@ implementations that input `FusedMoEActivationFormat.Standard` support chunking #### maybe_make_prepare_finalize -The `maybe_make_prepare_finalize` method is responsbile for constructing an instance of `FusedMoEPrepareAndFinalize` when appropriate based on the current all2all backend, e.g. when EP + DP is enabled. The base class method currently constructs all the `FusedMoEPrepareAndFinalize` objects for the EP+DP case. Derived classes can override this method to construct prepare/finalize objects for different scenarios, e.g. `ModelOptNvFp4FusedMoE` can construct a `FlashInferCutlassMoEPrepareAndFinalize` for the EP+TP case. +The `maybe_make_prepare_finalize` method is responsible for constructing an instance of `FusedMoEPrepareAndFinalize` when appropriate based on the current all2all backend, e.g. when EP + DP is enabled. The base class method currently constructs all the `FusedMoEPrepareAndFinalize` objects for the EP+DP case. Derived classes can override this method to construct prepare/finalize objects for different scenarios, e.g. `ModelOptNvFp4FusedMoE` can construct a `FlashInferCutlassMoEPrepareAndFinalize` for the EP+TP case. 
Please refer to the implementations in, * `ModelOptNvFp4FusedMoE` @@ -198,7 +198,7 @@ Please refer to the implementations in, * `CompressedTensorsW8A8Fp8MoECutlassMethod` * `Fp8MoEMethod` * `ModelOptNvFp4FusedMoE` -dervied classes. +derived classes. #### init_prepare_finalize diff --git a/vllm/distributed/kv_transfer/README.md b/vllm/distributed/kv_transfer/README.md index 349d3dfbd84fc..39377aabcce3a 100644 --- a/vllm/distributed/kv_transfer/README.md +++ b/vllm/distributed/kv_transfer/README.md @@ -2,7 +2,7 @@ # Distributed KV cache transfer This folder implements distributed KV cache transfer across vLLM instances. -Currently the main usecase is for disaggregated prefilling. +Currently the main use case is for disaggregated prefilling. ## Abstractions @@ -14,7 +14,7 @@ The KV cache transfer contains three layer of abstractions: Why we need KV lookup buffer: FIFO pipe itself is not enough as prefill vLLM worker may process requests in a different order compared to decode vLLM worker. Say the QPS is really high, prefill worker may handle requests in order A -> B -> C, but the decode worker may process request C first. This is not the case that can be naturally handled by FIFO pipe, so we provide KV lookup buffer to help translate a FIFO pipe to a lookup buffer. -NOTE: KV pipe layer is bypassible: you can skip this layer if your distributed +NOTE: KV pipe layer is bypassable: you can skip this layer if your distributed communication service already supports key-value-based lookup (like redis or RDMA database). 
From f58675bfb36b67cdbca4d2356a2f580e7a706ec3 Mon Sep 17 00:00:00 2001 From: TianyuLi0 <116711075+TianyuLi0@users.noreply.github.com> Date: Tue, 26 Aug 2025 22:09:17 +0800 Subject: [PATCH 035/112] [CPU] add cpu fused moe pytorch native implementation (#23146) Signed-off-by: Tianyu Li <tianyu.li@arm.com> Co-authored-by: Li, Jiang <jiang1.li@intel.com> --- .../layers/fused_moe/cpu_fused_moe.py | 286 +++++++++++------- vllm/model_executor/layers/fused_moe/layer.py | 4 +- 2 files changed, 180 insertions(+), 110 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py index e67ff66882102..769a04b7de89d 100644 --- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py @@ -3,10 +3,110 @@ from typing import Callable, Optional import torch +from torch.nn import functional as F from vllm import envs +def silu_and_mul(x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + return F.silu(x[..., :d]) * x[..., d:] + + +def grouped_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + num_expert_group: int = 0, + topk_group: int = 0, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None +) -> tuple[torch.Tensor, torch.Tensor]: + assert hidden_states.shape[0] == gating_output.shape[0], ( + "Number of tokens mismatch") + + gating_output = gating_output.float() + if scoring_func == "softmax": + scores = torch.softmax(gating_output, dim=-1) + elif scoring_func == "sigmoid": + scores = gating_output.sigmoid() + else: + raise ValueError(f"Unsupported scoring function: {scoring_func}") + + num_token = scores.shape[0] + if e_score_correction_bias is not None: + original_scores = scores + scores = scores + e_score_correction_bias.unsqueeze(0) + group_scores = (scores.view(num_token, num_expert_group, + -1).topk(2, dim=-1)[0].sum(dim=-1)) + else: + group_scores = 
scores.view(num_token, num_expert_group, + -1).max(dim=-1).values # [n, n_group] + group_idx = torch.topk(group_scores, k=topk_group, dim=-1, + sorted=False)[1] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, n_group] + group_mask.scatter_(1, group_idx, 1) # [n, n_group] + score_mask = group_mask.unsqueeze(-1).expand( + num_token, num_expert_group, + scores.shape[-1] // num_expert_group).reshape(num_token, -1) # [n, e] + tmp_scores = scores.masked_fill(~score_mask.bool(), + float("-inf")) # [n, e] + + if e_score_correction_bias is not None: + topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)[1] + topk_weights = original_scores.gather(1, topk_ids) + else: + topk_weights, topk_ids = torch.topk(tmp_scores, + k=topk, + dim=-1, + sorted=False) + + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + return topk_weights, topk_ids.to(torch.int32) + + +def select_experts( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + use_grouped_topk: bool, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, +) -> tuple[torch.Tensor, torch.Tensor]: + if use_grouped_topk: + assert topk_group is not None + assert num_expert_group is not None + return grouped_topk(hidden_states=hidden_states, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize, + num_expert_group=num_expert_group, + topk_group=topk_group, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) + elif custom_routing_function is None: + assert scoring_func == "softmax" + topk_weights = torch.nn.functional.softmax(router_logits, + dim=1, + dtype=torch.float32) + topk_weights, topk_ids = torch.topk(topk_weights, top_k, dim=-1) + if renormalize: + topk_weights /= topk_weights.sum(dim=-1, 
keepdim=True) + return topk_weights, topk_ids.to(torch.int32) + else: + return custom_routing_function(hidden_states=hidden_states, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize) + + class IPEXFusedMOE: def __init__(self, layer: torch.nn.Module) -> None: @@ -56,113 +156,6 @@ class SGLFusedMOE: def __init__(self, layer: torch.nn.Module) -> None: pass - @staticmethod - def _grouped_topk( - hidden_states: torch.Tensor, - gating_output: torch.Tensor, - topk: int, - renormalize: bool, - num_expert_group: int = 0, - topk_group: int = 0, - scoring_func: str = "softmax", - e_score_correction_bias: Optional[torch.Tensor] = None - ) -> tuple[torch.Tensor, torch.Tensor]: - assert hidden_states.shape[0] == gating_output.shape[0], ( - "Number of tokens mismatch") - - gating_output = gating_output.float() - if scoring_func == "softmax": - scores = torch.softmax(gating_output, dim=-1) - elif scoring_func == "sigmoid": - scores = gating_output.sigmoid() - else: - raise ValueError(f"Unsupported scoring function: {scoring_func}") - - num_token = scores.shape[0] - if e_score_correction_bias is not None: - # Store original scores before applying correction bias. 
We use - # biased scores for expert selection but original scores for - # routing weights - original_scores = scores - scores = scores + e_score_correction_bias.unsqueeze(0) - group_scores = (scores.view(num_token, num_expert_group, - -1).topk(2, dim=-1)[0].sum(dim=-1)) - else: - group_scores = scores.view(num_token, num_expert_group, - -1).max(dim=-1).values # [n, n_group] - group_idx = torch.topk(group_scores, - k=topk_group, - dim=-1, - sorted=False)[1] # [n, top_k_group] - group_mask = torch.zeros_like(group_scores) # [n, n_group] - group_mask.scatter_(1, group_idx, 1) # [n, n_group] - score_mask = group_mask.unsqueeze(-1).expand( - num_token, num_expert_group, - scores.shape[-1] // num_expert_group).reshape(num_token, - -1) # [n, e] - tmp_scores = scores.masked_fill(~score_mask.bool(), - float("-inf")) # [n, e] - - if e_score_correction_bias is not None: - topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)[1] - # Use original unbiased scores for the routing weights - topk_weights = original_scores.gather(1, topk_ids) - else: - topk_weights, topk_ids = torch.topk(tmp_scores, - k=topk, - dim=-1, - sorted=False) - - if renormalize: - topk_weights = topk_weights / topk_weights.sum(dim=-1, - keepdim=True) - - return topk_weights, topk_ids.to(torch.int32) - - @staticmethod - def _select_experts( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - top_k: int, - use_grouped_topk: bool, - renormalize: bool, - topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None, - custom_routing_function: Optional[Callable] = None, - scoring_func: str = "softmax", - e_score_correction_bias: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, torch.Tensor]: - # DeekSeekv2 uses grouped_top_k - if use_grouped_topk: - assert topk_group is not None - assert num_expert_group is not None - topk_weights, topk_ids = SGLFusedMOE._grouped_topk( - hidden_states=hidden_states, - gating_output=router_logits, - topk=top_k, - 
renormalize=renormalize, - num_expert_group=num_expert_group, - topk_group=topk_group, - scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) - elif custom_routing_function is None: - assert scoring_func == "softmax" - topk_weights = torch.nn.functional.softmax(router_logits, - dim=1, - dtype=torch.float32) - topk_weights, topk_ids = torch.topk(topk_weights, top_k, dim=-1) - if renormalize: - topk_weights /= topk_weights.sum(dim=-1, keepdim=True) - topk_ids = topk_ids.to(torch.int32) - else: - topk_weights, topk_ids = custom_routing_function( - hidden_states=hidden_states, - gating_output=router_logits, - topk=top_k, - renormalize=renormalize) - - return topk_weights, topk_ids - def __call__( self, layer: torch.nn.Module, @@ -183,7 +176,7 @@ class SGLFusedMOE: ) -> torch.Tensor: assert activation == "silu", f"{activation} is not supported." assert not apply_router_weight_on_input - topk_weights, topk_ids = SGLFusedMOE._select_experts( + topk_weights, topk_ids = select_experts( hidden_states=x, router_logits=router_logits, use_grouped_topk=use_grouped_topk, @@ -213,3 +206,80 @@ class SGLFusedMOE: True, ) return x + + +class CPUFusedMOE: + + def __init__(self, layer: torch.nn.Module) -> None: + pass + + def __call__( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + ) -> torch.Tensor: + assert activation == "silu", f"{activation} is not supported." 
+ assert not apply_router_weight_on_input + topk_weights, topk_ids = select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + ) + + # Ref code from https://github.com/sgl-project/sglang/blob/716e682721397df103f347d22da8bd46c6016dab/python/sglang/srt/layers/moe/fused_moe_native.py#L53 + len_experts = global_num_experts + + cnts = topk_ids.new_zeros((topk_ids.shape[0], len_experts)) + cnts.scatter_(1, topk_ids.to(torch.int64), 1) + tokens_per_expert = cnts.sum(dim=0) + idxs = topk_ids.view(-1).argsort() + + sorted_tokens = x[idxs // topk_ids.shape[1]] + tokens_per_expert = tokens_per_expert.cpu().numpy() + + outputs = [] + start_idx = 0 + for i, num_tokens in enumerate(tokens_per_expert): + end_idx = start_idx + num_tokens + if num_tokens == 0: + continue + tokens_for_this_expert = sorted_tokens[start_idx:end_idx] + + layer_w13_weight = layer.w13_weight[i] + layer_w2_weight = layer.w2_weight[i] + + gate_up = F.linear(tokens_for_this_expert, layer_w13_weight) + gate_up = silu_and_mul(gate_up) + expert_out = F.linear(gate_up, layer_w2_weight) + outputs.append(expert_out) + start_idx = end_idx + + outs = torch.cat(outputs, + dim=0) if len(outputs) else sorted_tokens.new_empty(0) + new_x = torch.empty_like(outs) + + new_x[idxs] = outs + final_out = (new_x.view( + *topk_ids.shape, -1).type(topk_weights.dtype).mul_( + topk_weights.unsqueeze(dim=-1)).sum(dim=1).type(new_x.dtype)) + return final_out diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index fcc6987d26bb2..54406a5a2d87f 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -358,8 +358,8 @@ class 
UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): use_prepack=True, ) elif current_platform.is_cpu(): + from vllm.model_executor.layers.fused_moe import cpu_fused_moe if current_platform.get_cpu_architecture() == CpuArchEnum.X86: - from vllm.model_executor.layers.fused_moe import cpu_fused_moe from vllm.model_executor.layers.utils import ( check_cpu_sgl_kernel) dtype_w13 = layer.w13_weight.dtype @@ -382,7 +382,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): else: layer.cpu_fused_moe = cpu_fused_moe.IPEXFusedMOE(layer) else: - raise NotImplementedError("CPU MOE only supports x86 arch.") + layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) def apply( self, From 1fdc732419d9b9eb00e003f38d6e02c480131ac8 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Tue, 26 Aug 2025 10:32:37 -0400 Subject: [PATCH 036/112] [ROCm] Starting to add AMD code reviewers for ROCm components (#23496) Signed-off-by: Hongxia Yang <hongxia.yang@amd.com> --- .github/CODEOWNERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index ce9590f02ce71..c087fd555c661 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -79,4 +79,10 @@ mkdocs.yaml @hmellor /vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep /vllm/attention/ops/triton_unified_attention.py @tdoublep +# ROCm related: specify owner with write access to notify AMD folks for careful code review +/docker/Dockerfile.rocm* @gshtras +/vllm/v1/attention/backends/rocm*.py @gshtras +/vllm/v1/attention/backends/mla/rocm*.py @gshtras +/vllm/attention/ops/rocm*.py @gshtras +/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras From 379f828fba68bcafec8b283acfd2b831fc35afb9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 26 Aug 2025 16:43:28 +0100 Subject: [PATCH 037/112] [Docs] Reduce requirements for docs build (#23651) Signed-off-by: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> --- docs/mkdocs/hooks/generate_argparse.py | 52 +++++++++++++++++------ requirements/docs.txt | 14 ------- vllm/sequence.py | 7 +++- vllm/transformers_utils/config.py | 58 ++++++++++++-------------- 4 files changed, 72 insertions(+), 59 deletions(-) diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index ed5d3b0092ae7..051a2d904406d 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import importlib import logging import sys from argparse import SUPPRESS, HelpFormatter @@ -7,25 +8,52 @@ from pathlib import Path from typing import Literal from unittest.mock import MagicMock, patch +from pydantic_core import core_schema + +logger = logging.getLogger("mkdocs") + ROOT_DIR = Path(__file__).parent.parent.parent.parent ARGPARSE_DOC_DIR = ROOT_DIR / "docs/argparse" sys.path.insert(0, str(ROOT_DIR)) -sys.modules["aiohttp"] = MagicMock() -sys.modules["blake3"] = MagicMock() sys.modules["vllm._C"] = MagicMock() -from vllm.benchmarks import latency # noqa: E402 -from vllm.benchmarks import serve # noqa: E402 -from vllm.benchmarks import throughput # noqa: E402 -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402 -from vllm.entrypoints.cli.openai import ChatCommand # noqa: E402 -from vllm.entrypoints.cli.openai import CompleteCommand # noqa: E402 -from vllm.entrypoints.openai import cli_args # noqa: E402 -from vllm.entrypoints.openai import run_batch # noqa: E402 -from vllm.utils import FlexibleArgumentParser # noqa: E402 -logger = logging.getLogger("mkdocs") +class PydanticMagicMock(MagicMock): + """`MagicMock` that's able to generate pydantic-core schemas.""" + + def __get_pydantic_core_schema__(self, source_type, handler): + return core_schema.any_schema() + + +def auto_mock(module, attr, 
max_mocks=50): + """Function that automatically mocks missing modules during imports.""" + logger.info("Importing %s from %s", attr, module) + for _ in range(max_mocks): + try: + # First treat attr as an attr, then as a submodule + return getattr(importlib.import_module(module), attr, + importlib.import_module(f"{module}.{attr}")) + except importlib.metadata.PackageNotFoundError as e: + raise e + except ModuleNotFoundError as e: + logger.info("Mocking %s for argparse doc generation", e.name) + sys.modules[e.name] = PydanticMagicMock() + + raise ImportError( + f"Failed to import {module}.{attr} after mocking {max_mocks} imports") + + +latency = auto_mock("vllm.benchmarks", "latency") +serve = auto_mock("vllm.benchmarks", "serve") +throughput = auto_mock("vllm.benchmarks", "throughput") +AsyncEngineArgs = auto_mock("vllm.engine.arg_utils", "AsyncEngineArgs") +EngineArgs = auto_mock("vllm.engine.arg_utils", "EngineArgs") +ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand") +CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand") +cli_args = auto_mock("vllm.entrypoints.openai", "cli_args") +run_batch = auto_mock("vllm.entrypoints.openai", "run_batch") +FlexibleArgumentParser = auto_mock("vllm.utils", "FlexibleArgumentParser") class MarkdownFormatter(HelpFormatter): diff --git a/requirements/docs.txt b/requirements/docs.txt index a24b9c7e924bf..3b72a8a9e755e 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -14,20 +14,6 @@ ruff # Required for argparse hook only -f https://download.pytorch.org/whl/cpu cachetools -cbor2 -cloudpickle -fastapi msgspec -openai -openai-harmony -partial-json-parser -pillow -psutil -pybase64 pydantic -setproctitle torch -transformers -zmq -uvloop -prometheus-client diff --git a/vllm/sequence.py b/vllm/sequence.py index 43d5c8beef270..3c4c77aea5ed8 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -16,14 +16,17 @@ import msgspec import torch from vllm.inputs import SingletonInputs 
-from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.pooling_params import PoolingParams from vllm.sampling_params import RequestOutputKind, SamplingParams if TYPE_CHECKING: + from vllm.lora.request import LoRARequest from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorOutput) +else: + LoRARequest = Any + KVConnectorOutput = Any VLLM_TOKEN_ID_ARRAY_TYPE = "l" @@ -1138,7 +1141,7 @@ class IntermediateTensors: """ tensors: dict[str, torch.Tensor] - kv_connector_output: Optional["KVConnectorOutput"] + kv_connector_output: Optional[KVConnectorOutput] def __init__(self, tensors): # manually define this function, so that diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 674c820daba29..2cd799e5eb5a9 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -27,19 +27,6 @@ from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME from vllm import envs from vllm.logger import init_logger -# yapf conflicts with isort for this block -# yapf: disable -from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config, - EAGLEConfig, JAISConfig, - KimiVLConfig, MedusaConfig, - MLPSpeculatorConfig, - Nemotron_Nano_VL_Config, - NemotronConfig, OvisConfig, - RWConfig, SpeculatorsConfig, - Step3TextConfig, Step3VLConfig, - UltravoxConfig) -# yapf: enable -from vllm.transformers_utils.configs.mistral import adapt_config_dict from vllm.transformers_utils.utils import check_gguf_file if envs.VLLM_USE_MODELSCOPE: @@ -67,24 +54,31 @@ def _get_hf_token() -> Optional[str]: return None -_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { - "chatglm": ChatGLMConfig, - "deepseek_vl_v2": DeepseekVLV2Config, - "kimi_vl": KimiVLConfig, - "Llama_Nemotron_Nano_VL": Nemotron_Nano_VL_Config, - "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) - "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) - 
"jais": JAISConfig, - "mlp_speculator": MLPSpeculatorConfig, - "medusa": MedusaConfig, - "eagle": EAGLEConfig, - "speculators": SpeculatorsConfig, - "nemotron": NemotronConfig, - "ovis": OvisConfig, - "ultravox": UltravoxConfig, - "step3_vl": Step3VLConfig, - "step3_text": Step3TextConfig, -} +class LazyConfigDict(dict): + + def __getitem__(self, key): + import vllm.transformers_utils.configs as configs + return getattr(configs, super().__getitem__(key)) + + +_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( + chatglm="ChatGLMConfig", + deepseek_vl_v2="DeepseekVLV2Config", + kimi_vl="KimiVLConfig", + Llama_Nemotron_Nano_VL="Nemotron_Nano_VL_Config", + RefinedWeb="RWConfig", # For tiiuae/falcon-40b(-instruct) + RefinedWebModel="RWConfig", # For tiiuae/falcon-7b(-instruct) + jais="JAISConfig", + mlp_speculator="MLPSpeculatorConfig", + medusa="MedusaConfig", + eagle="EAGLEConfig", + speculators="SpeculatorsConfig", + nemotron="NemotronConfig", + ovis="OvisConfig", + ultravox="UltravoxConfig", + step3_vl="Step3VLConfig", + step3_text="Step3TextConfig", +) _CONFIG_ATTRS_MAPPING: dict[str, str] = { "llm_config": "text_config", @@ -461,6 +455,8 @@ def get_config( model, revision, **kwargs) config_dict["max_position_embeddings"] = max_position_embeddings + from vllm.transformers_utils.configs.mistral import adapt_config_dict + config = adapt_config_dict(config_dict) # Mistral configs may define sliding_window as list[int]. 
Convert it From 513298f1b44157f7ae2f7007ef7b17c2929d11d4 Mon Sep 17 00:00:00 2001 From: Yuekai Zhang <zhangyuekai@foxmail.com> Date: Tue, 26 Aug 2025 23:47:50 +0800 Subject: [PATCH 038/112] [Bugfix] fix bf16 multimodal model hash (#23623) Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Roger Wang <hey@rogerw.io> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> --- vllm/multimodal/hasher.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 210a4ec762879..479961776a6a0 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -43,7 +43,19 @@ class MultiModalHasher: return cls.item_to_bytes( "image", np.asarray(convert_image_mode(obj, "RGBA"))) if isinstance(obj, torch.Tensor): - return cls.item_to_bytes("tensor", obj.cpu().numpy()) + tensor_obj: torch.Tensor = obj.cpu() + tensor_dtype = tensor_obj.dtype + if tensor_dtype == torch.bfloat16: + tensor_obj = tensor_obj.contiguous() + tensor_obj = tensor_obj.view( + (tensor_obj.numel(), )).view(torch.uint8) + return cls.item_to_bytes( + "tensor", { + "original_dtype": str(tensor_dtype), + "original_shape": tuple(tensor_obj.shape), + "data": tensor_obj.numpy() + }) + return cls.item_to_bytes("tensor", tensor_obj.numpy()) if isinstance(obj, np.ndarray): # If the array is non-contiguous, we need to copy it first arr_data = obj.data if obj.flags.c_contiguous else obj.tobytes() From 9d4183dd2e751e94442d7f02966d33cc085de708 Mon Sep 17 00:00:00 2001 From: Yuekai Zhang <zhangyuekai@foxmail.com> Date: Tue, 26 Aug 2025 23:48:08 +0800 Subject: [PATCH 039/112] [model] support qwen2audio embedding input (#23625) Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../models/qwen2_5_omni_thinker.py 
| 13 ++- vllm/model_executor/models/qwen2_audio.py | 109 ++++++++++++++---- 2 files changed, 93 insertions(+), 29 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index a61b8ca8f7ae7..5c64c81547e65 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -47,7 +47,7 @@ from vllm.model_executor.models.qwen2_5_vl import ( Qwen2_5_VLProcessingInfo, Qwen2_5_VLVideoEmbeddingInputs, Qwen2_5_VLVideoInputs, Qwen2_5_VLVideoPixelInputs) from vllm.model_executor.models.qwen2_audio import ( - Qwen2AudioInputs, Qwen2AudioProcessingInfo, + Qwen2AudioFeatureInputs, Qwen2AudioProcessingInfo, _get_feat_extract_output_lengths) from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalDataParser from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -534,7 +534,7 @@ class Qwen2_5OmniConditionalGenerationMixin: return torch.concat(mm_input, dim=dim) def _parse_and_validate_audio_input( - self, **kwargs: object) -> Optional[Qwen2AudioInputs]: + self, **kwargs: object) -> Optional[Qwen2AudioFeatureInputs]: input_audio_features = kwargs.pop('input_audio_features', None) audio_feature_lengths = kwargs.pop('audio_feature_lengths', None) feature_attention_mask = kwargs.pop('feature_attention_mask', None) @@ -548,9 +548,10 @@ class Qwen2_5OmniConditionalGenerationMixin: if not isinstance(input_audio_features, (torch.Tensor, list)): raise ValueError("Incorrect type of audio input features. 
" f"Got type: {type(input_audio_features)}") - return Qwen2AudioInputs(input_features=input_audio_features, - audio_feature_lengths=audio_feature_lengths, - feature_attention_mask=feature_attention_mask) + return Qwen2AudioFeatureInputs( + input_features=input_audio_features, + audio_feature_lengths=audio_feature_lengths, + feature_attention_mask=feature_attention_mask) def _parse_and_validate_image_input( self, @@ -630,7 +631,7 @@ class Qwen2_5OmniConditionalGenerationMixin: def _process_audio_input( self, - audio_input: Qwen2AudioInputs, + audio_input: Qwen2AudioFeatureInputs, audio_hashes: list[str] = None, cached_audio_features: torch.Tensor = None, ) -> torch.Tensor: diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 86c567ca36174..86b4a9a018c76 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -23,7 +23,7 @@ # limitations under the License. """Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Optional, TypedDict, Union +from typing import Any, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -36,9 +36,11 @@ from transformers.models.whisper import WhisperFeatureExtractor from vllm.config import VllmConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, +from vllm.multimodal.inputs import (AudioItem, ModalityData, + MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) -from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, +from vllm.multimodal.parse import (AudioProcessorItems, DictEmbeddingItems, + ModalityDataItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, 
PromptReplacement, @@ -52,7 +54,8 @@ from .utils import (AutoWeightsLoader, init_vllm_registered_model, # # === Audio Inputs === # -class Qwen2AudioInputs(TypedDict): +class Qwen2AudioFeatureInputs(TypedDict): + type: Literal["audio_features"] input_features: torch.Tensor """Shape: `(num_audios, num_mel_bins, 3000)`""" @@ -60,6 +63,16 @@ class Qwen2AudioInputs(TypedDict): """Shape: `(num_audios, 3000)`""" +class Qwen2AudioEmbeddingInputs(TypedDict): + type: Literal["audio_embeds"] + audio_embeds: list[torch.Tensor] + """Shape: `(num_audio_features, hidden_size)` + `hidden_size` must match the hidden size of language model backbone. + """ + + +Qwen2AudioInputs = Union[Qwen2AudioFeatureInputs, Qwen2AudioEmbeddingInputs] + # === Audio Encoder === # @@ -128,12 +141,38 @@ class Qwen2AudioDummyInputsBuilder( } +def _qwen2audio_field_config(hf_inputs: Mapping[str, torch.Tensor]): + return dict( + audio_embeds=MultiModalFieldConfig.batched("audio"), + input_features=MultiModalFieldConfig.batched("audio"), + feature_attention_mask=MultiModalFieldConfig.batched("audio"), + ) + + +class Qwen2AudioMultiModalDataParser(MultiModalDataParser): + + def _parse_audio_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[AudioItem]], + ) -> Optional[ModalityDataItems[Any, Any]]: + if isinstance(data, dict): + return DictEmbeddingItems( + data, + modality="audio", + required_fields={"audio_embeds"}, + fields_factory=_qwen2audio_field_config, + ) + + return super()._parse_audio_data(data) + + class Qwen2AudioMultiModalProcessor( BaseMultiModalProcessor[Qwen2AudioProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: feature_extractor = self.info.get_feature_extractor() - return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) + return Qwen2AudioMultiModalDataParser( + target_sr=feature_extractor.sampling_rate) def _call_hf_processor( self, @@ -173,10 +212,7 @@ class Qwen2AudioMultiModalProcessor( hf_inputs: BatchFeature, hf_processor_mm_kwargs: 
Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - input_features=MultiModalFieldConfig.batched("audio"), - feature_attention_mask=MultiModalFieldConfig.batched("audio"), - ) + return _qwen2audio_field_config(hf_inputs) def _get_prompt_updates( self, @@ -184,6 +220,7 @@ class Qwen2AudioMultiModalProcessor( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) tokenizer = self.info.get_tokenizer() vocab = tokenizer.get_vocab() @@ -211,7 +248,15 @@ class Qwen2AudioMultiModalProcessor( audio_output_lengths = audio_output_lens.tolist() def get_replacement_qwen2_audio(item_idx: int): - num_features = audio_output_lengths[item_idx] + + if audio_output_lengths: + num_features = audio_output_lengths[item_idx] + else: + audio_embeds = out_mm_data["audio_embeds"][item_idx] + assert len(audio_embeds.shape + ) == 2, "audio_embeds must be a 2D tensor" + num_features = audio_embeds.shape[0] + if num_features == 0: audios = mm_items.get_items("audio", AudioProcessorItems) audio_len = audios.get_audio_length(item_idx) @@ -286,21 +331,39 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, def _parse_and_validate_audio_input( self, **kwargs: object) -> Optional[Qwen2AudioInputs]: input_features = kwargs.pop('input_features', None) + audio_embeds = kwargs.pop('audio_embeds', None) feature_attention_mask = kwargs.pop('feature_attention_mask', None) - if input_features is None: - return None - input_features = self._validate_and_reshape_mm_tensor( - input_features, 'input_features') - feature_attention_mask = self._validate_and_reshape_mm_tensor( - feature_attention_mask, 'feature_attention_mask') - if not isinstance(input_features, (torch.Tensor, list)): - raise ValueError("Incorrect type of audio input features. 
" - f"Got type: {type(input_features)}") - return Qwen2AudioInputs(input_features=input_features, - feature_attention_mask=feature_attention_mask) - def _process_audio_input(self, - audio_input: Qwen2AudioInputs) -> torch.Tensor: + if input_features is None and audio_embeds is None: + return None + + if audio_embeds is not None: + if not isinstance(audio_embeds, (torch.Tensor, list)): + raise ValueError("Incorrect type of audio embeds. " + f"Got type: {type(audio_embeds)}") + audio_embeds = self._validate_and_reshape_mm_tensor( + audio_embeds, "audio_embeds") + return Qwen2AudioEmbeddingInputs(type="audio_embeds", + audio_embeds=audio_embeds) + + if input_features is not None: + input_features = self._validate_and_reshape_mm_tensor( + input_features, 'input_features') + feature_attention_mask = self._validate_and_reshape_mm_tensor( + feature_attention_mask, 'feature_attention_mask') + return Qwen2AudioFeatureInputs( + type="audio_features", + input_features=input_features, + feature_attention_mask=feature_attention_mask) + + raise AssertionError("This line should be unreachable.") + + def _process_audio_input( + self, audio_input: Qwen2AudioInputs + ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: + if audio_input["type"] == "audio_embeds": + audio_embeds = audio_input["audio_embeds"] + return tuple(audio_embeds) input_features = audio_input["input_features"] feature_attention_mask = audio_input["feature_attention_mask"] From 7ea22e42d5f666a26b3ce4117724dadfdb4d3887 Mon Sep 17 00:00:00 2001 From: nvjullin <jullin@nvidia.com> Date: Tue, 26 Aug 2025 23:53:04 +0800 Subject: [PATCH 040/112] [Misc] Add override for allreduce fusion thresholds (#23639) Signed-off-by: Julien Lin <jullin@nvidia.com> --- vllm/compilation/collective_fusion.py | 13 +++++++++++++ vllm/envs.py | 11 +++++++++++ 2 files changed, 24 insertions(+) diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index c44ac8e0aa7ea..0c545d8cffd24 100644 --- 
a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -10,6 +10,7 @@ from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._inductor.pattern_matcher import PatternMatcherPass from torch.distributed._symmetric_memory import enable_symm_mem_for_group +import vllm.envs as envs from vllm.config import VllmConfig from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce from vllm.distributed.parallel_state import ( @@ -401,6 +402,18 @@ if flashinfer_comm is not None: 6: MiB // 2, # 512KB 8: MiB // 2, # 512KB } + + try: + _FI_MAX_SIZES.update({ + int(k): int(float(v) * MiB) + for k, v in + envs.VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB.items() + }) + except Exception as e: + raise ValueError( + "Failed to parse VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB: " + + str(e)) from e + # opt for a more conservative default value # when world size is not in _FI_MAX_SIZES _DEFAULT_FI_MAX_SIZE = MiB // 2 diff --git a/vllm/envs.py b/vllm/envs.py index 1c9c4cdde8001..66c7c2c7f2c4d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib +import json import os import sys import tempfile @@ -1046,6 +1047,16 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")), + # Specifies the thresholds of the communicated tensor sizes under which + # vllm should use flashinfer fused allreduce. The variable should be a + # JSON with the following format: + # { <world size>: <max size in mb> } + # Unspecified world sizes will fallback to + # { 2: 64, 4: 1, <everything else>: 0.5 } + "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB": + lambda: json.loads(os.getenv( + "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB", "{}")), + # MoE routing strategy selector. 
# See `RoutingSimulator.get_available_strategies()` # for available # strategies. From 44ac25eae2cbbdc1cbcca423777107a5ca90a8f4 Mon Sep 17 00:00:00 2001 From: vllmellm <vllm.ellm@embeddedllm.com> Date: Wed, 27 Aug 2025 00:20:13 +0800 Subject: [PATCH 041/112] [CI] [Doc]: Add GH Action for auto labeling issues with `rocm` tag (#20988) Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> --- .github/workflows/issue_autolabel.yml | 305 ++++++++++++++++++++++++++ 1 file changed, 305 insertions(+) create mode 100644 .github/workflows/issue_autolabel.yml diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml new file mode 100644 index 0000000000000..6401d6586cc3d --- /dev/null +++ b/.github/workflows/issue_autolabel.yml @@ -0,0 +1,305 @@ +name: Label issues based on keywords +on: + issues: + types: [opened, edited, reopened] +permissions: + issues: write # needed so the workflow can add labels + contents: read +concurrency: + group: issue-labeler-${{ github.event.issue.number }} + cancel-in-progress: true +jobs: + add-labels: + runs-on: ubuntu-latest + steps: + - name: Label issues based on keywords + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + with: + script: | + // Configuration: Add new labels and keywords here + const labelConfig = { + rocm: { + // Keyword search - matches whole words only (with word boundaries) + keywords: [ + { + term: "composable kernel", + searchIn: "both" + }, + { + term: "rccl", + searchIn: "body" // only search in body + }, + { + term: "migraphx", + searchIn: "title" // only search in title + }, + { + term: "hipgraph", + searchIn: "both" + }, + { + term: "ROCm System Management Interface", + searchIn: "body" + }, + ], + + // Substring search - matches anywhere in text (partial matches) + substrings: [ + { + term: "VLLM_ROCM_", + searchIn: "both" + }, + { + term: "rocm", + searchIn: "title" + }, + { + term: "amd", + 
searchIn: "title" + }, + { + term: "hip-", + searchIn: "both" + }, + { + term: "gfx", + searchIn: "both" + }, + { + term: "cdna", + searchIn: "both" + }, + { + term: "rdna", + searchIn: "both" + }, + { + term: "torch_hip", + searchIn: "body" // only in body + }, + { + term: "_hip", + searchIn: "both" + }, + { + term: "hip_", + searchIn: "both" + }, + + // ROCm tools and libraries + { + term: "hipify", + searchIn: "both" + }, + ], + + // Regex patterns - for complex pattern matching + regexPatterns: [ + { + pattern: "\\bmi\\d{3}[a-z]*\\b", + description: "AMD GPU names (mi + 3 digits + optional letters)", + flags: "gi", + searchIn: "both" // "title", "body", or "both" + } + ], + }, + }; + + // Helper function to create regex based on search type + function createSearchRegex(term, type) { + // Escape special regex characters in the term + const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + + switch (type) { + case 'keyword': + // Word boundary search - matches whole words only + return new RegExp(`\\b${escapedTerm}\\b`, "gi"); + case 'substring': + // Substring search - matches anywhere in the text + return new RegExp(escapedTerm, "gi"); + default: + throw new Error(`Unknown search type: ${type}`); + } + } + + // Helper function to find matching terms in text with line information + function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') { + const matches = []; + const lines = text.split('\n'); + + for (const termConfig of searchTerms) { + let regex; + let term, searchIn, pattern, description, flags; + + // Handle different input formats (string or object) + if (typeof termConfig === 'string') { + term = termConfig; + searchIn = 'both'; // default + } else { + term = termConfig.term; + searchIn = termConfig.searchIn || 'both'; + pattern = termConfig.pattern; + description = termConfig.description; + flags = termConfig.flags; + } + + // Skip if this term shouldn't be searched in the current location + if 
(searchIn !== 'both' && searchIn !== searchLocation) { + continue; + } + + // Create appropriate regex + if (searchType === 'regex') { + regex = new RegExp(pattern, flags || "gi"); + } else { + regex = createSearchRegex(term, searchType); + } + + const termMatches = []; + + // Check each line for matches + lines.forEach((line, lineIndex) => { + const lineMatches = line.match(regex); + if (lineMatches) { + lineMatches.forEach(match => { + termMatches.push({ + match: match, + lineNumber: lineIndex + 1, + lineContent: line.trim(), + searchType: searchType, + searchLocation: searchLocation, + originalTerm: term || pattern, + description: description, + // Show context around the match in the line + context: line.length > 100 ? + line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), + line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' + : line.trim() + }); + }); + } + }); + + if (termMatches.length > 0) { + matches.push({ + term: term || (description || pattern), + searchType: searchType, + searchLocation: searchLocation, + searchIn: searchIn, + pattern: pattern, + matches: termMatches, + count: termMatches.length + }); + } + } + + return matches; + } + + // Helper function to check if label should be added + async function processLabel(labelName, config) { + const body = context.payload.issue.body || ""; + const title = context.payload.issue.title || ""; + + core.notice(`Processing label: ${labelName}`); + core.notice(`Issue Title: "${title}"`); + core.notice(`Issue Body length: ${body.length} characters`); + + let shouldAddLabel = false; + let allMatches = []; + let reason = ''; + + const keywords = config.keywords || []; + const substrings = config.substrings || []; + const regexPatterns = config.regexPatterns || []; + + core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`); + + // Search in title + if (title.trim()) { + 
core.notice(`Searching in title: "${title}"`); + + const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title'); + const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title'); + const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title'); + + allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches); + } + + // Search in body + if (body.trim()) { + core.notice(`Searching in body (${body.length} characters)`); + + const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body'); + const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body'); + const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body'); + + allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches); + } + + if (allMatches.length > 0) { + core.notice(`Found ${allMatches.length} matching term(s):`); + + for (const termMatch of allMatches) { + const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body'; + const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn; + + if (termMatch.searchType === 'regex') { + core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); + } else { + core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); + } + + // Show details for each match + termMatch.matches.forEach((match, index) => { + core.notice(` ${index + 1}. 
Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`); + if (match.description) { + core.notice(` Description: ${match.description}`); + } + core.notice(` Context: ${match.context}`); + if (match.lineContent !== match.context) { + core.notice(` Full line: ${match.lineContent}`); + } + }); + } + + shouldAddLabel = true; + const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0); + const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0); + const bodyMatches = allMatches.filter(t => t.searchLocation === 'body').reduce((sum, t) => sum + t.count, 0); + const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0); + const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0); + const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0); + + reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`; + } + + core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`); + core.notice(`Reason: ${reason || 'No matching terms found'}`); + + if (shouldAddLabel) { + const existingLabels = context.payload.issue.labels.map(l => l.name); + if (!existingLabels.includes(labelName)) { + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + labels: [labelName], + }); + core.notice(`Label "${labelName}" added. 
${reason}`); + return true; + } + core.notice(`Label "${labelName}" already present.`); + return false; + } + + core.notice(`No matching terms found for label "${labelName}".`); + return false; + } + + // Process all configured labels + const processLabels = Object.entries(labelConfig) + .map(([labelName, config]) => processLabel(labelName, config)); + const labelsAdded = await Promise.all(processLabels); + const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0); + core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`); \ No newline at end of file From 9b0187003e62bdb7311b23b5b5026ea8e4e207d3 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" <jiang1.li@intel.com> Date: Wed, 27 Aug 2025 01:10:42 +0800 Subject: [PATCH 042/112] [Bugfix] Fix cuda event usage with CPU model runner (#23643) Signed-off-by: jiang1.li <jiang1.li@intel.com> --- vllm/v1/worker/cpu_model_runner.py | 28 +++++++++++++++++++++++++--- vllm/v1/worker/gpu_model_runner.py | 2 +- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index a7180afbd64b5..137578f0e6088 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -11,6 +11,7 @@ from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.v1.attention.backends.cpu_attn import TorchSDPAMetadataBuilderV1 from vllm.v1.worker.gpu_model_runner import GPUModelRunner +from vllm.v1.worker.utils import CpuGpuBuffer if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput @@ -21,7 +22,8 @@ logger = init_logger(__name__) class CPUModelRunner(GPUModelRunner): def __init__(self, vllm_config: VllmConfig, device: torch.device): - super().__init__(vllm_config, device) + with _torch_cuda_wrapper(): + super().__init__(vllm_config, device) assert device == torch.device("cpu") assert self.speculative_config is None, "spec decode is not supported." 
@@ -71,8 +73,8 @@ class CPUModelRunner(GPUModelRunner): setattr(obj, device_attr_name, cpu_tensor) for k, v in vars(self).items(): - if k.endswith("_cpu") and isinstance(v, torch.Tensor): - replace_tensor(self, k, k[:-4]) + if isinstance(v, CpuGpuBuffer): + v.gpu = v.cpu for k, v in vars(self.input_batch).items(): if k.endswith("_cpu_tensor") and isinstance(v, torch.Tensor): @@ -108,6 +110,26 @@ class CPUModelRunner(GPUModelRunner): def _sync_device(self) -> None: pass + def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: + return sampled_token_ids.tolist() + + +@contextmanager +def _torch_cuda_wrapper(): + + class _EventPlaceholder: + + def __init__(self, *args, **kwargs) -> None: + self.record = lambda: None + self.synchronize = lambda: None + + try: + cuda_event = torch.cuda.Event + torch.cuda.Event = _EventPlaceholder + yield + finally: + torch.cuda.Event = cuda_event + @contextmanager def _set_global_compilation_settings(config: VllmConfig): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 14f2305dadc54..f1ceaaae62a70 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -321,7 +321,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): (self.max_model_len, 1), dtype=torch.int64, device="cpu", - pin_memory=True) + pin_memory=self.pin_memory) def _make_buffer(self, *args, dtype: torch.dtype) -> CpuGpuBuffer: return CpuGpuBuffer(*args, From 730d0ac8b9678d64294ddc1e3431a27a50b5e42f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= <ohg3417@gmail.com> Date: Wed, 27 Aug 2025 03:19:23 +0900 Subject: [PATCH 043/112] [Docs] Fix warnings in `mkdocs build` (#23649) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Zerohertz <ohg3417@gmail.com> Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com> Signed-off-by: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../backends/differential_flash_attn.py | 14 ++++++---- vllm/attention/backends/flash_attn.py | 5 ++-- vllm/attention/backends/rocm_flash_attn.py | 11 ++++---- vllm/attention/backends/utils.py | 2 +- vllm/attention/backends/xformers.py | 12 ++++----- vllm/core/block_manager.py | 8 +++--- vllm/engine/async_llm_engine.py | 4 +-- vllm/engine/llm_engine.py | 8 +++--- vllm/entrypoints/llm.py | 10 +++---- .../tool_parsers/minimax_tool_parser.py | 3 ++- vllm/model_executor/layers/lightning_attn.py | 11 +++++++- vllm/model_executor/layers/linear.py | 5 ++-- vllm/outputs.py | 4 +-- vllm/sequence.py | 27 +++++++------------ 14 files changed, 66 insertions(+), 58 deletions(-) diff --git a/vllm/attention/backends/differential_flash_attn.py b/vllm/attention/backends/differential_flash_attn.py index ce9467efd23c7..caa02530d2fd6 100644 --- a/vllm/attention/backends/differential_flash_attn.py +++ b/vllm/attention/backends/differential_flash_attn.py @@ -805,14 +805,18 @@ class DifferentialFlashAttentionImpl(AttentionImpl): """Forward pass with FlashAttention. Args: - query: shape = [num_tokens, num_heads, head_size] - key: shape = [num_tokens, num_kv_heads, head_size] - value: shape = [num_tokens, num_kv_heads, head_size] - output: shape = [num_tokens, num_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + layer: Attention layer instance. + q: Query tensor with shape = [num_tokens, num_heads, head_size] + k: Key tensor with shape = [num_tokens, num_kv_heads, head_size] + v: Value tensor with shape = [num_tokens, num_kv_heads, head_size] + kv_cache: KV cache tensor with shape + [2, num_blocks, block_size, num_kv_heads, head_size]. NOTE: kv_cache will be an empty tensor with shape [0] for profiling run. attn_metadata: Metadata for attention. 
+ output: Output tensor with shape [num_tokens, num_heads, head_size] + output_scale: Optional output scale tensor. + output_block_scale: Optional output block scale tensor. NOTE: It in-place updates the output tensor. NOTE: FP8 quantization, flash-attn expect the size of {q,k,v}_descale to be (num_sequences, num_kv_heads). diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index ba7a9afe86782..d8cb208c4f2ea 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -605,7 +605,8 @@ class FlashAttentionImpl(AttentionImpl): key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] output: shape = [num_tokens, num_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: KV cache tensor with shape + [2, num_blocks, block_size, num_kv_heads, head_size]. NOTE: kv_cache will be an empty tensor with shape [0] for profiling run. attn_metadata: Metadata for attention. @@ -850,7 +851,7 @@ class FlashAttentionImpl(AttentionImpl): def _get_query_key_seq_metadata( - attn_metadata, + attn_metadata: FlashAttentionMetadata, is_prompt: bool, attn_type: str, ) -> tuple: diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index e4c27a0ef36e9..9262144e37b54 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -584,17 +584,18 @@ class ROCmFlashAttentionImpl(AttentionImpl): use prefill sequence attributes Args: + layer: Attention layer instance. query: shape = [num_tokens, num_heads * head_size] key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + kv_cache: KV cache tensor with shape + [2, num_blocks, block_size * num_kv_heads * head_size]. 
NOTE: kv_cache will be an empty tensor with shape [0] for profiling run. attn_metadata: Metadata for attention. - attn_type: Select attention type, between encoder attention, - decoder self-attention, or encoder/decoder cross- - attention. Defaults to decoder self-attention, - which is the vLLM default generally + output: Optional output tensor. + output_scale: Optional output scale tensor. + output_block_scale: Optional output block scale tensor. Returns: shape = [num_tokens, num_heads * head_size] """ diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 34e059067d84d..7b6c426b0f851 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -561,7 +561,7 @@ def get_num_prefill_decode_query_kv_tokens( Raises: AssertionError: If the number of encoder tokens in `attn_metadata` - is `None` when required for the calculations. + is `None` when required for the calculations. """ num_prefill_query_tokens = 0 num_decode_query_tokens = 0 diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index c1213f7620a7a..302d3d7ea903f 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -471,17 +471,18 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): max_encoder_seq_len) Args: + layer: Attention layer instance. query: shape = [num_tokens, num_heads * head_size] key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + kv_cache: KV cache tensor with shape + [2, num_blocks, block_size * num_kv_heads * head_size]. NOTE: kv_cache will be an empty tensor with shape [0] for profiling run. attn_metadata: Metadata for attention. - attn_type: Select attention type, between encoder attention, - decoder self-attention, or encoder/decoder cross- - attention. 
Defaults to decoder self-attention, - which is the vLLM default generally + output: Optional output tensor. + output_scale: Optional output scale tensor. + output_block_scale: Optional output block scale tensor. Returns: shape = [num_tokens, num_heads * head_size] """ @@ -644,7 +645,6 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): for API spec. Args: - output: shape = [num_prefill_tokens, num_heads, head_size] query: shape = [num_prefill_tokens, num_heads, head_size] key: shape = [num_prefill_tokens, num_kv_heads, head_size] value: shape = [num_prefill_tokens, num_kv_heads, head_size] diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 4ec5a775f465c..cbfa4d7ff3c4c 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -352,7 +352,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager): with num_lookahead_slots. Args: - sequence_group (SequenceGroup): The sequence group to swap in. + seq_group (SequenceGroup): The sequence group to swap in. num_lookahead_slots (int): Number of lookahead slots used in speculative decoding, default to 0. @@ -405,8 +405,6 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager): Args: seq_group (SequenceGroup): The sequence group to swap out. - num_lookahead_slots (int): Number of lookahead slots used in - speculative decoding, default to 0. Returns: bool: Whether it's possible to swap out current sequence group. @@ -420,7 +418,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager): swapping out the given sequence_group with num_lookahead_slots. Args: - sequence_group (SequenceGroup): The sequence group to swap out. + seq_group (SequenceGroup): The sequence group to swap out. Returns: List[Tuple[int, int]]: The mapping of swapping block from @@ -473,7 +471,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager): on to the 'device'. Args: - sequence_group (SequenceGroup): The sequence group to swap in/out. + seq_group (SequenceGroup): The sequence group to swap in/out. 
device (Device): device to swap the 'seq_group' on. status (SequenceStatus): The status of sequence which is needed for action. RUNNING for swap out and SWAPPED for swap in diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 84ad2299b0655..4fb028627a8c4 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -486,10 +486,10 @@ class AsyncLLMEngine(EngineClient): _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine def __init__(self, - *args, + *args: Any, log_requests: bool = True, start_engine_loop: bool = True, - **kwargs) -> None: + **kwargs: Any) -> None: if envs.VLLM_USE_V1: raise ValueError( "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. " diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index dbf8d3ba50146..cbd714c159eb5 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -644,10 +644,10 @@ class LLMEngine: Details: - Set arrival_time to the current time if it is None. - Set prompt_token_ids to the encoded prompt if it is None. - - Create `n` number of [Sequence][vllm.Sequence] objects. - - Create a [SequenceGroup][vllm.SequenceGroup] object - from the list of [Sequence][vllm.Sequence]. - - Add the [SequenceGroup][vllm.SequenceGroup] object to the + - Create `n` number of [Sequence][vllm.sequence.Sequence] objects. + - Create a [SequenceGroup][vllm.sequence.SequenceGroup] object + from the list of [Sequence][vllm.sequence.Sequence]. + - Add the [SequenceGroup][vllm.sequence.SequenceGroup] object to the scheduler. 
Example: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 728ed8328d36d..8816ff56d6840 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -186,7 +186,7 @@ class LLM: CompilationConfig]] = None, logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None, - **kwargs, + **kwargs: Any, ) -> None: """LLM constructor.""" @@ -697,8 +697,8 @@ class LLM: Generate responses for a chat conversation. The chat conversation is converted into a text prompt using the - tokenizer and calls the [generate][] method to generate the - responses. + tokenizer and calls the [generate][vllm.LLM.generate] method to generate + the responses. Multi-modal inputs can be passed in the same way you would pass them to the OpenAI API. @@ -1334,8 +1334,8 @@ class LLM: def wake_up(self, tags: Optional[list[str]] = None): """ - Wake up the engine from sleep mode. See the [sleep][] method - for more details. + Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep] + method for more details. Args: tags: An optional list of tags to reallocate the engine memory diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py index 283e6095013d6..0fd62f0b6a7f1 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py @@ -461,7 +461,8 @@ class MinimaxToolParser(ToolParser): i += 1 return boundaries - def _extract_tool_args(self, tool_content: str, args_match) -> str: + def _extract_tool_args(self, tool_content: str, + args_match: re.Match[str]) -> str: """ Extract tool arguments from tool content. 
diff --git a/vllm/model_executor/layers/lightning_attn.py b/vllm/model_executor/layers/lightning_attn.py index 8ffc700ca5cde..0b87acc851208 100644 --- a/vllm/model_executor/layers/lightning_attn.py +++ b/vllm/model_executor/layers/lightning_attn.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + import torch from einops import rearrange @@ -453,7 +455,14 @@ class _attention(torch.autograd.Function): lightning_attention_ = _attention.apply -def lightning_attention(q, k, v, ed, block_size=256, kv_history=None): +def lightning_attention( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + ed: torch.Tensor, + block_size: int = 256, + kv_history: Optional[torch.Tensor] = None +) -> tuple[torch.Tensor, torch.Tensor]: """ Apply lightning attention algorithm to compute attention efficiently. diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index dd54aebeb011e..c0fcacd1e6ee9 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -233,10 +233,10 @@ class LinearBase(CustomOp): Args: input_size: input dimension of the linear layer. output_size: output dimension of the linear layer. - bias: If true, add bias. skip_bias_add: If true, skip adding bias but instead return it. params_dtype: Data type for the parameters. quant_config: Quantization configure. + prefix: Prefix for parameter names. return_bias: If true, return bias together with outputs in forward pass. """ @@ -378,13 +378,14 @@ class MergedReplicatedLinear(ReplicatedLinear): Args: input_size: input dimension of the linear layer. - output_size: output dimension of the linear layer. + output_sizes: list of output dimensions of the linear layer. bias: If true, add bias. skip_bias_add: If true, skip adding bias but instead return it. params_dtype: Data type for the parameters. quant_config: Quantization configure. 
prefix: The name of the layer in the state dict, including all parents (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. """ def __init__( diff --git a/vllm/outputs.py b/vllm/outputs.py index 9784a8894472f..acdb2f89ce735 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -409,7 +409,7 @@ class EmbeddingOutput: Args: embedding: The embedding vector, which is a list of floats. - Its length depends on the hidden dimension of the model. + Its length depends on the hidden dimension of the model. """ embedding: list[float] @@ -447,7 +447,7 @@ class ClassificationOutput: Args: probs: The probability vector, which is a list of floats. - Its length depends on the number of classes. + Its length depends on the number of classes. """ probs: list[float] diff --git a/vllm/sequence.py b/vllm/sequence.py index 3c4c77aea5ed8..36b1b198bd5a5 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -147,18 +147,7 @@ class SequenceDataDelta( class SequenceData(msgspec.Struct, omit_defaults=True): # type: ignore[call-arg] - """Data associated with a sequence. - - Args: - prompt_token_ids: The token IDs of the prompt. - output_token_ids: The token IDs of the output. Set to an empty list if - None. - - Attributes: - prompt_token_ids: The token IDs of the prompt. - output_token_ids: The token IDs of the output. - cumulative_logprob: The cumulative log probability of the output. - """ + """Data associated with a sequence.""" # NOTE: we cannot use Union[list, array] because msgspec cannot support # union of 2 list types. 
_prompt_token_ids: array @@ -256,10 +245,12 @@ class SequenceData(msgspec.Struct, @property def cumulative_logprob(self) -> float: + """The cumulative log probability of the output.""" return self._cumulative_logprob @property def prompt_token_ids(self) -> tuple[int, ...]: + """The token IDs of the prompt.""" return self._prompt_token_ids_tuple @prompt_token_ids.setter @@ -277,6 +268,7 @@ class SequenceData(msgspec.Struct, @property def output_token_ids(self) -> tuple[int, ...]: + """The token IDs of the output.""" return tuple(self._output_token_ids) @output_token_ids.setter @@ -940,7 +932,7 @@ class SequenceGroupMetadata( omit_defaults=True): # type: ignore[call-arg] """Metadata for a sequence group. Used to create `AttentionMetadata`. - Args: + Attributes: request_id: The ID of the request. is_prompt: Whether the request is at prompt stage. seq_data: The sequence data. (Seq id -> sequence data) @@ -950,14 +942,14 @@ class SequenceGroupMetadata( do_sample: True if sampling is required. Sampling is not required when e.g., prefill is chunked, and the current iteration only computes query tokens for prefill, we don't need sampling. - token_chunk_size: The number of tokens to be processed (per sequence). - None if chunking is not required. + pooling_params: Pooling parameters. lora_request: LoRA request. computed_block_nums: The block numbers that are already computed, used in prefix caching. state: Internal state tied to this sequence group. + token_type_ids: Token type IDs. multi_modal_data: Multi modal data. - mm_processor_kwargs: Multimodal input processor / mapper overrides. + multi_modal_placeholders: Multi modal placeholders. encoder_seq_data: Optional sequence data for encoder prompt (SequenceGroup.encoder_seq). Should be None unless you are working with an encoder/decoder @@ -1043,12 +1035,13 @@ class SequenceOutput( array_like=True): # type: ignore[call-arg] """The model output associated with a sequence. 
- Args: + Attributes: parent_seq_id: The ID of the parent sequence (for forking in beam search). output_token: The output token ID. logprobs: The logprobs of the output token. (Token id -> logP(x_i+1 | x_0, ..., x_i)) + output_embed: Optional output embedding tensor. """ parent_seq_id: int output_token: int From 227e231b55901be4e050d5a8f033e90f45cfba85 Mon Sep 17 00:00:00 2001 From: Thomas Parnell <tpa@zurich.ibm.com> Date: Tue, 26 Aug 2025 20:33:16 +0200 Subject: [PATCH 044/112] [Docs] [V1] [Hybrid] Update docs to remove FlashInfer constraint for hybrid models (#23665) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> --- docs/usage/v1_guide.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 7fc615d4c042f..64bd0d9bf5071 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -111,11 +111,10 @@ Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaFor Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that -these models currently require disabling prefix caching and using the FlashInfer attention backend in V1. +these models currently require disabling prefix caching in V1. Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`). -Please note that these models currently require disabling prefix caching, enforcing eager mode, and using the FlashInfer -attention backend in V1. +Please note that these models currently require disabling prefix caching and enforcing eager mode in V1. 
#### Encoder-Decoder Models From 98aa16ff41353e3e6c8a3c2f4e933a888dbce1cb Mon Sep 17 00:00:00 2001 From: Russell Bryant <rbryant@redhat.com> Date: Tue, 26 Aug 2025 14:49:06 -0400 Subject: [PATCH 045/112] [v1] Add cross-attention KV cache support for encoder-decoder models (#23664) Signed-off-by: Russell Bryant <rbryant@redhat.com> --- vllm/multimodal/registry.py | 19 +++++++ vllm/v1/core/kv_cache_coordinator.py | 34 ++++++++---- vllm/v1/core/kv_cache_manager.py | 6 ++- vllm/v1/core/sched/scheduler.py | 37 ++++++++++++- vllm/v1/core/single_type_kv_cache_manager.py | 56 +++++++++++++++++++- vllm/v1/kv_cache_interface.py | 15 ++++++ 6 files changed, 153 insertions(+), 14 deletions(-) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index ded56cca80999..8cd9e5604872a 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -372,3 +372,22 @@ class MultiModalRegistry: ) return dummy_data + + def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int: + """ + Get the maximum length of the encoder input for encoder-decoder models. + """ + if not model_config.is_encoder_decoder: + return 0 + max_tokens = self.\ + get_max_tokens_per_item_by_nonzero_modality(model_config) + if not max_tokens: + # TODO - this function assumes encoder-decoder models are + # multimodal. This will need to change when adding support for more + # than whisper. + return 0 + assert len(max_tokens) == 1, "Encoder-decoder models are expected \ + to implement the multimodal interface with at most one modality." 
+ + first_modality = next(iter(max_tokens)) + return max_tokens[first_modality] diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index a0ea4d96015a2..f082ad00f2e35 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -6,7 +6,7 @@ from typing import Optional from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock from vllm.v1.core.single_type_kv_cache_manager import ( - FullAttentionManager, get_manager_for_kv_cache_spec) + CrossAttentionManager, FullAttentionManager, get_manager_for_kv_cache_spec) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) from vllm.v1.request import Request @@ -42,9 +42,10 @@ class KVCacheCoordinator(ABC): ) for i, kv_cache_group in enumerate( self.kv_cache_config.kv_cache_groups)) - def get_num_blocks_to_allocate( - self, request_id: str, num_tokens: int, - new_computed_blocks: tuple[list[KVCacheBlock], ...]) -> int: + def get_num_blocks_to_allocate(self, request_id: str, num_tokens: int, + new_computed_blocks: tuple[ + list[KVCacheBlock], ...], + num_encoder_tokens: int) -> int: """ Get the number of blocks needed to be allocated for the request. @@ -54,14 +55,22 @@ class KVCacheCoordinator(ABC): tokens that are already allocated). new_computed_blocks: The new computed blocks just hitting the prefix caching. + num_encoder_tokens: The number of encoder tokens for allocating + blocks for cross-attention. Returns: The number of blocks. """ num_blocks_to_allocate = 0 for i, manager in enumerate(self.single_type_managers): - num_blocks_to_allocate += manager.get_num_blocks_to_allocate( - request_id, num_tokens, new_computed_blocks[i]) + if isinstance(manager, CrossAttentionManager): + # For cross-attention, we issue a single static allocation + # of blocks based on the number of encoder input tokens. 
+ num_blocks_to_allocate += manager.get_num_blocks_to_allocate( + request_id, num_encoder_tokens, []) + else: + num_blocks_to_allocate += manager.get_num_blocks_to_allocate( + request_id, num_tokens, new_computed_blocks[i]) return num_blocks_to_allocate def save_new_computed_blocks( @@ -79,8 +88,11 @@ class KVCacheCoordinator(ABC): manager.save_new_computed_blocks(request_id, new_computed_blocks[i]) - def allocate_new_blocks(self, request_id: str, - num_tokens: int) -> tuple[list[KVCacheBlock], ...]: + def allocate_new_blocks( + self, + request_id: str, + num_tokens: int, + num_encoder_tokens: int = 0) -> tuple[list[KVCacheBlock], ...]: """ Allocate new blocks for the request to give it at least `num_tokens` token slots. @@ -89,12 +101,16 @@ class KVCacheCoordinator(ABC): request_id: The request ID. num_tokens: The total number of tokens that need a slot (including tokens that are already allocated). + num_encoder_tokens: The number of encoder tokens for allocating + blocks for cross-attention. Returns: The new allocated blocks. """ return tuple( - manager.allocate_new_blocks(request_id, num_tokens) + manager.allocate_new_blocks( + request_id, num_encoder_tokens if isinstance( + manager, CrossAttentionManager) else num_tokens) for manager in self.single_type_managers) def cache_blocks(self, request: Request, num_computed_tokens: int) -> None: diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index fd0bdb2c80fc5..b427a9c497fef 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -187,6 +187,7 @@ class KVCacheManager: new_computed_blocks: Optional[KVCacheBlocks] = None, num_lookahead_tokens: int = 0, delay_cache_blocks: bool = False, + num_encoder_tokens: int = 0, ) -> Optional[KVCacheBlocks]: """Add slots for a request with new tokens to append. 
@@ -253,6 +254,7 @@ class KVCacheManager: request_id=request.request_id, num_tokens=num_tokens_need_slot, new_computed_blocks=new_computed_block_list, + num_encoder_tokens=num_encoder_tokens, ) if num_blocks_to_allocate > self.block_pool.get_num_free_blocks(): @@ -273,7 +275,7 @@ class KVCacheManager: new_computed_block_list) new_blocks = self.coordinator.allocate_new_blocks( - request.request_id, num_tokens_need_slot) + request.request_id, num_tokens_need_slot, num_encoder_tokens) # P/D: delay caching blocks if we have to recv from # remote. Update state for locally cached blocks. @@ -292,7 +294,7 @@ class KVCacheManager: def free(self, request: Request) -> None: """Free the blocks allocated for the request. - We free the blocks in reverse order so that he tail blocks are evicted + We free the blocks in reverse order so that the tail blocks are evicted first when caching is enabled. Args: diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 522b340b32aaf..14a914d8f2f0b 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -58,6 +58,7 @@ class Scheduler(SchedulerInterface): self.parallel_config = vllm_config.parallel_config self.log_stats = log_stats self.structured_output_manager = structured_output_manager + self.is_encoder_decoder = vllm_config.model_config.is_encoder_decoder # include_finished_set controls whether a separate set of finished # request ids should be included in the EngineCoreOutputs returned @@ -83,6 +84,9 @@ class Scheduler(SchedulerInterface): assert len(self.kv_cache_config.kv_cache_groups) == 1, ( "Multiple KV cache groups are not currently supported " "with KV connectors") + assert not self.is_encoder_decoder, ( + "Encoder-decoder models are not currently supported " + "with KV connectors") self.connector = KVConnectorFactory.create_connector( config=self.vllm_config, role=KVConnectorRole.SCHEDULER) @@ -431,6 +435,22 @@ class Scheduler(SchedulerInterface): == 0 else 
self.num_lookahead_tokens) + # Determine if we need to allocate cross-attention blocks. + if self.is_encoder_decoder and request.has_encoder_inputs: + # TODO(russellb): For Whisper, we know that the input is + # always padded to the maximum length. If we support other + # encoder-decoder models, this will need to be updated if we + # want to only allocate what is needed. + assert ("whisper" + in self.vllm_config.model_config.model.lower()), ( + "Whisper is the only supported " + "encoder-decoder model.") + num_encoder_tokens = MULTIMODAL_REGISTRY.\ + get_encdec_max_encoder_len( + self.vllm_config.model_config) + else: + num_encoder_tokens = 0 + new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens + num_external_computed_tokens, @@ -438,6 +458,7 @@ class Scheduler(SchedulerInterface): new_computed_blocks, num_lookahead_tokens=effective_lookahead_tokens, delay_cache_blocks=load_kv_async, + num_encoder_tokens=num_encoder_tokens, ) if new_blocks is None: @@ -703,7 +724,21 @@ class Scheduler(SchedulerInterface): # The encoder input is not needed in this step. break - if start_pos + num_encoder_tokens <= num_computed_tokens: + if self.is_encoder_decoder and num_computed_tokens > 0: + assert start_pos == 0, ( + "Encoder input should be processed at the beginning of " + "the sequence when encoder-decoder models are used.") + # Encoder input has already been computed + # The calculation here is a bit different. We don't turn encoder + # output into tokens that get processed by the decoder and + # reflected in num_computed_tokens. Instead, start_pos reflects + # the position where we need to ensure we calculate encoder + # inputs. This should always be 0 to ensure we calculate encoder + # inputs before running the decoder. Once we've calculated some + # decoder tokens (num_computed_tokens > 0), then we know we + # already calculated encoder inputs and can skip here. 
+ continue + elif start_pos + num_encoder_tokens <= num_computed_tokens: # The encoder input is already computed and stored # in the decoder's KV cache. continue diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 82e0292522b9a..f0af92122958c 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -8,8 +8,9 @@ from vllm.utils import cdiv from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec, - FullAttentionSpec, KVCacheSpec, - MambaSpec, SlidingWindowSpec) + CrossAttentionSpec, FullAttentionSpec, + KVCacheSpec, MambaSpec, + SlidingWindowSpec) from vllm.v1.request import Request @@ -552,11 +553,62 @@ class MambaManager(SingleTypeKVCacheManager): return new_blocks +class CrossAttentionManager(SingleTypeKVCacheManager): + """Manager for cross-attention KV cache in encoder-decoder models.""" + + def save_new_computed_blocks( + self, request_id: str, + new_computed_blocks: list[KVCacheBlock]) -> None: + # We do not cache blocks for cross-attention to be shared between + # requests, so `new_computed_blocks` should always be empty. + assert len(new_computed_blocks) == 0 + + def cache_blocks(self, request: Request, num_tokens: int) -> None: + # We do not cache blocks for cross-attention to be shared between + # requests, so this method is not relevant. 
+ raise ValueError("Should not be called as prefix caching is disabled.") + + def get_num_common_prefix_blocks(self, request_id: str, + num_running_requests: int) -> int: + # Cross-attention blocks contain request-specific encoder states + # and are not shared between different requests + return 0 + + @classmethod + def find_longest_cache_hit( + cls, + block_hashes: list[BlockHash], + max_length: int, + kv_cache_group_ids: list[int], + block_pool: BlockPool, + kv_cache_spec: KVCacheSpec, + use_eagle: bool, + ) -> tuple[list[KVCacheBlock], ...]: + assert isinstance(kv_cache_spec, CrossAttentionSpec), ( + "CrossAttentionManager can only be used for cross-attention groups" + ) + # Cross-attention does not benefit from prefix caching since: + # 1. Encoder states are unique per request (different audio/image + # inputs) + # 2. Encoder states are computed once per request, not incrementally + # 3. No reusable prefix exists between different multimodal inputs + # Return empty blocks to indicate no cache hits + raise NotImplementedError( + "CrossAttentionManager does not support caching") + + def remove_skipped_blocks(self, request_id: str, + num_computed_tokens: int) -> None: + # Cross-attention blocks represent encoder states which are needed + # for the entire decoding process, so no blocks should be skipped + pass + + spec_manager_map: dict[type[KVCacheSpec], type[SingleTypeKVCacheManager]] = { FullAttentionSpec: FullAttentionManager, SlidingWindowSpec: SlidingWindowManager, ChunkedLocalAttentionSpec: ChunkedLocalAttentionManager, MambaSpec: MambaManager, + CrossAttentionSpec: CrossAttentionManager, } diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index ed8e0bf798988..a3e4d393e4d20 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -11,6 +11,7 @@ from typing_extensions import Self from vllm.config import VllmConfig from vllm.logger import init_logger +from vllm.multimodal import MULTIMODAL_REGISTRY from 
vllm.utils import cdiv, get_dtype_size logger = init_logger(__name__) @@ -211,6 +212,20 @@ class EncoderOnlyAttentionSpec(AttentionSpec): return 0 +@dataclass(frozen=True) +class CrossAttentionSpec(AttentionSpec): + """ + KV cache spec for cross-attention layers in encoder-decoder models. + """ + + def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: + # For cross-attention, we need to cache encoder states + # Get encoder length (e.g., 1500 for Whisper). + max_encoder_len = MULTIMODAL_REGISTRY.\ + get_encdec_max_encoder_len(vllm_config.model_config) + return cdiv(max_encoder_len, self.block_size) * self.page_size_bytes + + @dataclass class KVCacheTensor: """ From 9715f7bb0fd70fa3dac6f35c824e90e58f0086ce Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Wed, 27 Aug 2025 03:01:25 +0800 Subject: [PATCH 046/112] [Bugfix] Fix incorrect original shape in hashing (#23672) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Co-authored-by: Lukas Geiger <lukas.geiger94@gmail.com> --- tests/multimodal/test_hasher.py | 7 ++++--- vllm/multimodal/hasher.py | 10 ++++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/multimodal/test_hasher.py b/tests/multimodal/test_hasher.py index 75a233c2567cb..2751e38760e17 100644 --- a/tests/multimodal/test_hasher.py +++ b/tests/multimodal/test_hasher.py @@ -45,10 +45,11 @@ def test_hash_collision_image_transpose(): assert hasher.hash_kwargs(image=image1) != hasher.hash_kwargs(image=image2) -def test_hash_collision_tensor_shape(): +@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16]) +def test_hash_collision_tensor_shape(dtype): # The hash should be different though the data is the same when flattened - arr1 = torch.zeros((5, 10, 20, 3)) - arr2 = torch.zeros((10, 20, 5, 3)) + arr1 = torch.zeros((5, 10, 20, 3), dtype=dtype) + arr2 = torch.zeros((10, 20, 5, 3), dtype=dtype) hasher = MultiModalHasher assert hasher.hash_kwargs(data=arr1) != 
hasher.hash_kwargs(data=arr2) diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 479961776a6a0..3708dc7065ba1 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -45,16 +45,22 @@ class MultiModalHasher: if isinstance(obj, torch.Tensor): tensor_obj: torch.Tensor = obj.cpu() tensor_dtype = tensor_obj.dtype + tensor_shape = tensor_obj.shape + + # NumPy does not support bfloat16. + # Workaround: View the tensor as a contiguous 1D array of bytes if tensor_dtype == torch.bfloat16: tensor_obj = tensor_obj.contiguous() tensor_obj = tensor_obj.view( (tensor_obj.numel(), )).view(torch.uint8) + return cls.item_to_bytes( "tensor", { "original_dtype": str(tensor_dtype), - "original_shape": tuple(tensor_obj.shape), - "data": tensor_obj.numpy() + "original_shape": tuple(tensor_shape), + "data": tensor_obj.numpy(), }) + return cls.item_to_bytes("tensor", tensor_obj.numpy()) if isinstance(obj, np.ndarray): # If the array is non-contiguous, we need to copy it first From c37c0af990ed1f3623448b82903c1ae52e84cc05 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu <riverclouds.zhu@qq.com> Date: Wed, 27 Aug 2025 03:31:20 +0800 Subject: [PATCH 047/112] [Misc] Fix comments in `tests/kernels/quantization` (#23675) Signed-off-by: zjy0516 <riverclouds.zhu@qq.com> --- tests/kernels/quantization/test_awq_triton.py | 2 +- tests/kernels/quantization/test_cutlass_2of4_sparse.py | 2 +- tests/kernels/quantization/test_cutlass_scaled_mm.py | 2 +- tests/kernels/quantization/test_cutlass_w4a8.py | 2 +- tests/kernels/quantization/test_machete_mm.py | 2 +- tests/kernels/quantization/test_marlin_gemm.py | 2 +- tests/kernels/quantization/test_triton_scaled_mm.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/kernels/quantization/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py index 96797e85bd125..9354495642b28 100644 --- a/tests/kernels/quantization/test_awq_triton.py +++ b/tests/kernels/quantization/test_awq_triton.py @@ 
-2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the AWQ Triton kernel. -Run `pytest tests/kernels/test_awq_triton.py`. +Run `pytest tests/kernels/quantization/test_awq_triton.py`. """ import pytest import torch diff --git a/tests/kernels/quantization/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py index 878f66647e19e..ae61b3b3a28a8 100644 --- a/tests/kernels/quantization/test_cutlass_2of4_sparse.py +++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for sparse cutlass kernels -Run `pytest tests/kernels/test_semi_structured.py`. +Run `pytest tests/kernels/quantization/test_cutlass_2of4_sparse.py`. """ import pytest diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index a15decdf6f827..65320509e173f 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for cutlass kernels -Run `pytest tests/kernels/test_cutlass.py`. +Run `pytest tests/kernels/quantization/test_cutlass_scaled_mm.py`. """ import random diff --git a/tests/kernels/quantization/test_cutlass_w4a8.py b/tests/kernels/quantization/test_cutlass_w4a8.py index 7832f8179d0ec..f659408efe8c6 100644 --- a/tests/kernels/quantization/test_cutlass_w4a8.py +++ b/tests/kernels/quantization/test_cutlass_w4a8.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the CUTLASS W4A8 kernel. -Run `pytest tests/kernels/test_cutlass_w4a8.py`. +Run `pytest tests/kernels/quantization/test_cutlass_w4a8.py`. 
""" from dataclasses import dataclass diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index 0e09661c955e4..50584f3f82d4c 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the machete kernel. -Run `pytest tests/kernels/test_machete_mm.py`. +Run `pytest tests/kernels/quantization/test_machete_mm.py`. """ import math diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py index ad077e0b94732..0be020085bfa4 100644 --- a/tests/kernels/quantization/test_marlin_gemm.py +++ b/tests/kernels/quantization/test_marlin_gemm.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the marlin kernel. -Run `pytest tests/kernels/marlin/test_marlin_gemm.py`. +Run `pytest tests/kernels/quantization/test_marlin_gemm.py`. """ import pytest import torch diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py index 24245663fb1d6..d8cfb5710dbad 100644 --- a/tests/kernels/quantization/test_triton_scaled_mm.py +++ b/tests/kernels/quantization/test_triton_scaled_mm.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the triton_scaled_mm kernel -Run `pytest tests/kernels/test_triton_scaled_mm.py`. +Run `pytest tests/kernels/quantization/test_triton_scaled_mm.py`. 
""" import importlib from typing import Optional From 9816b81f5f9f85391dc30ae5f48185542dfec2af Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Wed, 27 Aug 2025 03:46:52 +0800 Subject: [PATCH 048/112] [Model] Enable video support for InternVL3.5 models (#23658) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- docs/models/supported_models.md | 4 ++-- tests/models/multimodal/processing/test_common.py | 3 +++ .../models/multimodal/processing/test_tensor_schema.py | 7 ++++++- tests/models/registry.py | 5 ++++- vllm/model_executor/models/internvl.py | 10 +++++++--- 5 files changed, 22 insertions(+), 7 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 4763f2281d323..74f3a9d1cdb56 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -627,7 +627,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | | `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ | | `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | @@ -701,7 +701,7 @@ Some models are supported only via the [Transformers backend](#transformers). Th - There's no PLE caching or out-of-memory swapping support, as described in [Google's blog](https://developers.googleblog.com/en/introducing-gemma-3n/). These features might be too model-specific for vLLM, and swapping in particular may be better suited for constrained setups. !!! note - Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently. + For `InternVLChatModel`, only InternVL2.5 with Qwen2.5 text backbone (`OpenGVLab/InternVL2.5-1B` etc), InternVL3 and InternVL3.5 have video inputs support currently. !!! note To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. 
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index a604d11f0e769..74ca10d32609a 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -286,6 +286,9 @@ def _test_processing_correctness_one( "internlm/Intern-S1", "OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL3-1B", + "OpenGVLab/InternVL3_5-1B", + "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview", + "OpenGVLab/InternVL3_5-30B-A3B", "Kwai-Keye/Keye-VL-8B-Preview", "moonshotai/Kimi-VL-A3B-Instruct", "meta-llama/Llama-4-Scout-17B-16E-Instruct", diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 79164f02c3398..2d8cd49edc73b 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -38,7 +38,12 @@ ARCH_NEEDS_EXTRAS = [ "MiniCPMV", "PaliGemmaForConditionalGeneration", ] -REPO_ID_TO_SKIP = {"nm-testing/pixtral-12b-FP8-dynamic": "duplicated test"} +REPO_ID_TO_SKIP = { + "nm-testing/pixtral-12b-FP8-dynamic": "duplicated test", + # FIXME(Isotr0py): enable GPT-OSS based InternVL3.5 model + # after support PP for GPT-OSS + "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview": "Broken model", +} ImageInput = list[Image.Image] VideoInput = Union[list[Image.Image], list[np.ndarray], diff --git a/tests/models/registry.py b/tests/models/registry.py index b34c6f2e5dc84..20c7c3af67764 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -422,7 +422,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True), # noqa: E501 "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", extras={"2B": "OpenGVLab/InternVL2-2B", - "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501 + "3.0": "OpenGVLab/InternVL3-1B", # noqa: E501 + "3.5-qwen3": "OpenGVLab/InternVL3_5-1B", # noqa: E501 + "3.5-qwen3moe": "OpenGVLab/InternVL3_5-30B-A3B", # noqa: 
E501 + "3.5-gptoss": "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview"}, # noqa: E501 trust_remote_code=True), "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 trust_remote_code=True), diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index da8ad8396725d..b09ed7bbe72a3 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -855,9 +855,13 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo): def get_video_token(self) -> Optional[str]: text_model_type = self.get_hf_config().get_text_config().model_type - if text_model_type == "qwen2": - return "<|video_pad|>" - return None + video_token_map = { + "qwen2": "<|video_pad|>", + "qwen3": "<|video_pad|>", + "qwen3_moe": "<|video_pad|>", + "gpt_oss": "<|reserved_200000|>", + } + return video_token_map.get(text_model_type) def get_num_frames_with_most_features( self, From d696f86e7bdf23a6a4c212fee3522a589a460b24 Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Tue, 26 Aug 2025 13:19:05 -0700 Subject: [PATCH 049/112] [doc] Hybrid KV Cache Manager design doc (#22688) Signed-off-by: Chen Zhang <zhangch99@outlook.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../basic_grouping_example.png | Bin 0 -> 24096 bytes .../hybrid_kv_cache_manager/full_attn.png | Bin 0 -> 4120 bytes .../hybrid_kv_cache_manager/memory_layout.png | Bin 0 -> 63113 bytes .../hybrid_kv_cache_manager/overview.png | Bin 0 -> 39501 bytes .../hybrid_kv_cache_manager/sw_attn.png | Bin 0 -> 4560 bytes docs/design/hybrid_kv_cache_manager.md | 245 ++++++++++++++++++ 6 files changed, 245 insertions(+) create mode 100644 docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png create mode 100644 docs/assets/design/hybrid_kv_cache_manager/full_attn.png create mode 100644 
docs/assets/design/hybrid_kv_cache_manager/memory_layout.png create mode 100644 docs/assets/design/hybrid_kv_cache_manager/overview.png create mode 100644 docs/assets/design/hybrid_kv_cache_manager/sw_attn.png create mode 100644 docs/design/hybrid_kv_cache_manager.md diff --git a/docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png b/docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png new file mode 100644 index 0000000000000000000000000000000000000000..185f61e6a3edeefb7f3c2bc5bcb942af3408da0e GIT binary patch literal 24096 zcma&O1yog0+ckPnB&0)-kPd0-ZlqJXr5jYbLqb}*ySuxTknV2j?(To#cfWhb|9<1% z|DNH{!}jcb)?Ux^%xBKI)+SJ1Rt)Ji?rR7Hf+Qg>@(BWYAqn0$!My-~zd@zYgg~^n zBt!(2oRSYx?6ok5pSrg={%#*)er%M&lz1az`BCY^hl)292u+%32qEF=YHD6DUW_d+ zF1{06SbTg`YwL#p;q_(fBx!5yaV90j$!ek1(RhjZyC5pG01+x2G*j#6^cn~p^b2nR zQ9{$De}|Cu$ZR$?HWpi49VZKw-Q3(*Frjg<kdTmu)A;k6nqZ6+F}PxowpBQzWsGof zadp}}4z{=7zkh$-54G-Tx!4#ZI`@2?7dJFxV`D#ccS>4HG-~>~x;FNv%f5a4#u5#O zgT=|oiG_t_=Qy2iX)csk3AIQaP+5Rj*zD<4=DfDP4h{Jc94r_A!P~r&$Wy}Al{>xT zfB#<jjR)^i^5<{C2@p7g@fINZ`4x`E=KpkHEiEeqiI<mGT3Y(Mh`6+LaejVzWo2e! 
z_LHiTlG63{^@k51)HF1Lf`X6<ILXL-BO*qB{qkb+{l;W1BO_yNeONIDLPbPFL&MB$ zJDT}1+FMXiFd-p<goH#zMTG<3`j@Yjl@%W!-^Im6V`C#17nge&I-IY?bm_rzo9DzU zM|=C5>uV)tWhxpPJRF>e4hgbPpFe+gaH#e2`ZbWi{0bTxW|)$gIHRFqc47h(A`eET zq%=Y4lbATCCe0iX6BCn`CU*lnFf@dWjBIRdoXTnu>+9QjgwDpw>i%$B)7q*dCl`L| z6Bixr_HY{$5%KkSofIa#`wo1os%lYwzP!#CO!UC@l@&cby-`Pf1c-}^OM6Gh9rWqh zSy5r(3&`C3yt|tlh--CC%^g2~7Rg<I9K+<~B$yNMed}MK1zX+kV$;$XXlS@Crf3mi zm_MJUrV{AtF8B3Gg0P+w3+|ldAD^BU78O0UXg~|zKRh5HAdr)jn+&H8{u<E~EX>b` zgM-`N-i~K9__N*{1v|X8(H{rKfk7#ojd*@}85<R)_~}z7;`!Mb20A)OBHRqE=;~^g z4<o;hDt-+Q5BK$%A9Hkvy-Q*;di|zzgq5_(0u2ofA0HnC0i+dp1H$rMWN2Uj6BV`I zZU>z+yS%)7Sn+(VJDf2_TUWO%GgIHhL=1UnXGgvKvv+y<aiMZic6K(1Oj2UvEhQ~2 ztw=b%ZfkOIu*KEkqC3g{Tn$1%M`+{z?X8QEk&(K(x~8Tk6ciL59v(@=4<a~%aCRmp zbaZr$R<{O!e;9UlM-h=uq8lX@mDjId(`(hEF7kd|6?tm=^7->$(_V`4xjAYYnxecs zF=XfaYeo!=C=l6#0+1C&Wo1WR@E8~ve!jlBGD)?jqa+j*xy&X*=_^Z1OCZB8_h)T4 z2lR}L+`+))4H31S_4G1ALPBzLb8Bi?xw-FR=`?ReI6NPDAVfSo&7O0+yStC7_9v@d zrnMLp(m!KkYbz?y(9w6(4s`oU3kxH`H^;<iSLnY&K^d+z>K7+N_aPD?8J(Nc(9&`{ zn5)Um%sks1Bt?aLh+=1BGdDAXhlih?nGu(g8ct#_D=I?ZHMFsjbLIls;mOccRAez# ztiG_YFf}y=Ze1}?*2>aS1jYUSIwB=y5yY~+-6t?`2L${(=K9)NM^DdNR8#?+x9I5Y z?d_+VgLr+v>gzdVWrxJV2|}egtp?@;F(`9oQ@EV)O_%)J;SO;FMLwyh@Y!vnHo6}F z{oNZyndMpd=;Z8ta&n?qp(c5@d2le;-+$LcmPOCa!=nW<BDm{QVx4B4bzo4Cijoo$ z92f<dREL#zzeh9GQY}vRTf2#B5mC_>kWZgJjR>={vVy^@(K^r$Bt=C<H8;DNm{9r< zy?XUZjdp5y7)9`B61xp37gc5DjL}#I-O`*KR&l=Ppn%UMu}gwXMnpm?DJc<CU`x2z zo35&;h#1847Wlid!3)CDxcB!j5d#DNXCYSBnovCU%=y&RR1iQ=V*r`JEDQ||nbucT zRmI28Jjb`e&#&wL=1c((6&?KrL|jIOoG>&zd~0RpCdGNL!}IB(dvGu%B_))o$LAFS zC|Z!1XwU`4#h6%Fhi8uuQnLZb??8}0rbSImC?UeE9UX-e^#CYtXlNMS%C4-83=coL zIoo=OI-0E#6B2q6IyE;}SXe0em}YKnuG8dj5dOlS{O<N*FF84RJ7(wbFqPAG^YUOG z4cg4wdUI=w65H|q`YRe*1k^GLXgqQB+WLBWP>`R`w_I<#g$zpM<;#~hr;*}h?Ck8a zva+;T-1=vt-&^Z`{P^+n=g-Q@$|y>?)e|SBsZ?Iq{ey#We9ligI!pW#c6R3=yi!t9 zP!Q-BzrnSamOR$}gn~LS1r6)V>3T0XcRWvSetw>i&#fL*fF~a)is0~Y0JcwI@3_GY zz(KcHnGCa8FPPfcc>ew6-Q3K>G6!G)R9G61v%ZZ@oBQ2mb~f2JWJX3tfT?>^C9dEa 
z-Qk4p?(U<<0K-j8Ok7>>w3{4q^76u0H*oDil8lXyPnT#099qv-lC!hNo#OZMfFv5v zmDw6jQ&myPMU<14o|u|?)VAo|j*O2t1p@(n!OCh;EGa9CbgsCsxA*2^PnDay>9H3a zX3%N=!IqI8$_!9)JHu=|mz9NOq5i7@t)Zs6`eK7EtQ7-*@|Dd^(S9R6y{~q5Ahx1e z2}wy{Y)|wU*w}HP+3-=yZmxC(gRr6y^5NT`fNA>u`!~3y^`D1_hq<{Frv~W-1?mb4 zqx^b8LPDaVFdA|%Ahhb0WT4R{B_)ZIfxsGrQa7z{Y;;jpR(5o}GKDWKE<V4y;s<$) z75obF76-@A-#-$s_1LKyl=EzrNkF8sy80}@oKl^ZAX(`*sHlJc{{8j)HxvXF4ec3Q z0|L5Ddto3G6BE*jEb#8K(b3UgIPhKcK(3b*7msX-h=>@1PO77$^AH6K12Z=}n=#74 z!Lh%;uhZ-V1=-%&=>-^Zkvrk#<pt6-OVvLx&}^~MetYuzd<TPuX0KGcX~Z-vER2xf zLj*=cVxqUV7jTWmCdX*+Jb)XHhsgl}upkLJkN<??-CZ4l)OaLpl7;cUEG75w@R*yM zJ2*J-_xG2OkN{l|bg*bN5>ir=!NjhiA$lsRK%$;Vaw%}3oWeqC28K50vrU8UFsSFg znVgbhGL)?T=@SVJ&E4rnKbVdyhW@UuuDLlbHa0doy3m*y`@uw3P|%=zYG`P9dwYW% z+1uWRKzLlvd*<eDCyP`8B{DNJ18FRl#_8$lSxuduk@0oCCjuZx)@Dg&rjV2r4K1y- zu5OyQx1gLH0|!U^>o{veLn9+bX6C4;dytOB#l?VxJQ(2O;B0Jd+00cJ$T@xbgicDj z28Iqkh)u5*pP0zyU`|d(2EJ}6g}b|FW?oL32=$Yq;^@(MgYA~}LOoh=S5R>9l^h?q zDUgFu5Nc{_&~2FY+Mz+Uod3+rGZ7H*ijJO8>tSPLOyIPofTXv!K5ma@y+t%WTxfuJ z$t1D8<8zD0ADo`Pa^xieT@uuh=i@!+Mqm^b6&0uB;X(|Js%T4pRb}Pa@82t6!nV2J z`NhOc{`s>xQK0zsE5H#n)iNFJ78m=cYS3t+C}hknEf)ag*q^N`({9Sm$?@x0`NFZ_ zkAw#@2Lb_619zte+7~nPnW>qqw6vj#2>{30I-9hHh8xg4K<4!v^?@okGc~2vZFPIE z4q{vY8Xx~7h~fJ({+BO>)9S``0U^LfM_=9CjE#${GZ`kJr>D0&>?|&(1;GD}#o74= z>L~+YQ1t1Cx6;<u^bjIa(yXGQ{^@B|F)=Y^<%@Pd1QC*xqshxQT)>s>?d`$v0RgP2 zsJOhm1ZCw4=vi18vQZs=0umk@hziJzV5X5je<&CjY<~N^iuMNV1$-BcO7S&KmvC7^ z7)f+!6zHMcj)x+35@eAT6(=B^Kc84-ML&N0YHPdP>W&wByfaP-2*Ulnn;kn5kyk`S zMR74As}z`bv5tp_2PlYX{~K};NC1x8Yy{zT<>fy@LPQ)#^FV@P38Dg!6BZU`X>Msr z@$=j03dV->0bm7&ow)+;fQN^N+hJb-we$CHK}E%=1&+h*?fjAwUUO^<dC@P^TH4w- zH#bzmzJ7k%T3Rp?nYp>5vm8`(babE|0I`!x<9oix;9wfShl-8nXJ;euC`St$0GS01 z4S=4hvGJeY-h_yVlcs7E0v>@hhtuyOn;iit2BRE-prEj;7r+l!Hbu_o+i0Yuq`TSU z0f&dybaW+PPKe+t^*a0?L!;(vtrX?u1@N>%zYP>g<aNDV=?FlB=JdGdFk03W74-+y z(S-|4Bp_}NQE9xcDxhROeLDT+{Sp8KIwodyb#-<~xnVCl1VTut3;6bXYR}Vkcm{@} zY>6m`!v*J+_E#W%-@Qu#Uk)t@6cK<-fEBxYd%x3gDaXghao)ZKDJqK02xgU$QLqYd 
z-)B^gh`^znh4VQ-KewDI_qL>4SX><a!pK+w_(Nn?Lt`Tt)1{oXjSau^>3VD|7PKIM z)!JG%h?fWnuk$H1q}+Z_g`dBzNcCAaadB|mx9kIs49Ii(k@5>~uf}(8-n^Neodsi) zkdy@I4JI=XK+bR~@Aov;bkk(Aj^DpW+YVHPg!DT@qgc+8MG8tv2K)NH+1r0~WPBS^ zQ*!|t?C4@-Bs!qz0LKsJ>#~SJ*U-|^0%A|h5gz(QggD?PA`C3EH7E~cVw!w+0uBYB z!Jqt-3xEangYW$D0|H5(PGO<}w+|qdhhTYOAvr$Y>0(!zB{?l^d1mJAA~HPu%E}6$ z4v3AKIy$L8e~yff3aU>(I)2vI7xJ)TCyEe<cqJ#}S3ACjq!$<Si1+)~)ztwmxa)1K zr$@}d;CXj>046RhXIB@<ONbX3IRIlA$lvw#?6NX|GR}ZrRDXqm{AzF4H!%2aU8d9G z(&3Mk#^+vHQK9$wbNbwz9JBF&04f;=hf{AP`CCN)dtV5|da1d=W|<G1eC`bt2Prql zKoa{A$UYw*pSJPYv9Z;|Ll02DQ2ali{F@w)R8&FV=Dvb2D=udF@S(h_ire$iO;vRo zP$E$G<6~oBQh;n3;n>F3F*7SjOC!{HRRQpBY6=+%2{d+*wPvT2RRDckLn+LRjDf+y zd*j=@`f$)f1_o=O!;M}J1CZ?N>uYtr0t&|R;r0R)5)kB8rlv&QHkOw6*C(WnbYx_P zW@drb%Z>I@GBTl+)?Y7nCvljK--C|0yXzki0L0woRnaXT5JRBbgM0@<a;eSJ#=@e1 zXvo^iYOu4@_mM`SXC8Fk_36337#ct#kq{9<e+~%^&C1LKZ@^g}-VOj-sjshZXKxP$ z0pbc^0v@}y2tUW8=)vJ)({i(OR%t1p!JiO-vl3AhP>}e91bAp6!1FXT5&_$S2nY)s z02qKp#s?k&k*Wijq`Pa0Pf)l?>~^^(b`>u$!Dmm;hXL&dsMYpmr5)5fyY&Jb#MI18 z&)7KE&Plt<7^#8s`#x@%0|N0&*2l<xAymWuSUPKKYXEJ4TmirW(I%l3<mRWcwXm>w z2?4S`vIEq{XecNwK!Pf%t7kFXUp;+4K0B+cu0{<Msm}5dAz@`_M|l1EPghsOvIQQS z<=f6?&?SD>CB(;rq6OvoN9DZi`1lyu0zcz^0j2(dnHd2QQDi=URA53EZBdjk1Vjt~ z8vvaU^GzhW>CR3=kf&f^`Oc*1fgoKR934GD_5i>ejR!mou*~gRpf`a40u;lKE#jky zh@FE2P~FS*Utd}U-2+k+D5BA5x3gV%u)n|1=Gj(SN<W965+ARoqXV$wXn!AE%LEa| z_Tko^77GOP@pwfT1Q953z-5B588UkTf$i_^?*8@bv$b`YKbS_Kjf{+qBM0$Z1ZC6s z9>4`alTG2WH&=>2J3V~?=~0mcvDVSy10YLHU6_~G-SPYa{S5)~1#kdA1Z;3#%6&3W z?dkA=BCGRtHYoTU0tr>MwGIGb^YSRY1vtL00v@Re+Hbxo3?$wCDV&gBRE&rg@HpTj zfiZy(1HE8lZcfr|2}B{?c#Xw$Pnw`qENw(&q@;wz$?-90KHu!@Lc+u4WMyF>VCK_n zk-C5dBSHe8?jgr{ZDl1eH1y*5SitrDiBoe~Sy_5I5g}n(NC=|&JOol@JV=5b2*R%0 z=0WbLyj1^n?J30>0}T!2(a_)^If_Nk&`@l2v`C%3ogH9|L+$OW>+6O9UjXwaCid=F z0m@ZAYJ74M@Tk1}d=8(??>;_<%WZ^MSpF&#OiWCm%Gv9c2nbTo&+S0-WFe!YLn`?t zAubMPxuL#3eFbzx@QT*Pe!>LG8~Oz?Gqd8ykFOv<6B1?@7fUiSh-qk=zHRjJgSO95 zS5?*Ie#e%G50O_<@qgqmfB_BwD4Jb!-?5PqA~LehzP`DwwK;%jpuB5qYq#lvmf8?I 
znE_feucV{}@Dw}j;I2$vSva3!jT&lDVq|2{R+2)_7rSqqn<^{6fvNv)c(}D?Vrd!D z_#%vc9mR(a)N)06dHS+2<hi0nMMZfCj{g1?*zraOKaoTL>=|kMqN=Lu>e}@4r@XoO zE>M4fZ3O!pe0U23;}6xwCkAud0*JrFp#(rBR4lC6*jUjfp1bt?{Fyd%ENpCWa^05| zp}t?BjhvmF94t0rFNXg3A<*<q>{azi;C&1t1n2|-8WYe)qqXNi2ixWZtVUrIh}^N+ z*#@h5HoGe@*MOlxyll6ICa0&Pk`lqd03LazjCYXG)0bvv8v<<HK0I0LuBomD(gc9( z2m_?r1lbA}+!mkf<vuE}%by^AHr=pS0BnKB@^KFp&?|XW)v}V3kB{t~yMT1Ub^U_8 z@J=r&u>!>+3c3-H4ETw|qgi6W<pA(9(pvZT?`IbL611(2jSDK(j~_PxSZ-eipb-9g zTIg(QY#iagygFJ&gC3oprsQTM#SCo&5D*m`8wQk$g$1|!Eeiy4;M!!rhXo5Orl$7R zhX`msK=|1|e0W<Ge1!n)E1*9Bo&l2u`i0(Xr4a=M#rN*KeOGV*m=(cTbV6b?Kw)Bn zNV~@xP$yL$AqoDZQ7wgg{W{a#11KKQuuY##34spF1umAZZa5KeHxXfBVQ-)WaI#wK z06zT)42+45C4q}fNui^q4Y?ErtU3u;F<>Bmn24#$Pp$0E&dz;XwwOkWLbF$~F)>{| zJx)hUymr&Gvk56FEkJC`p_4@9=jWSRSj>-)e<$hzL<{&-0CoUz`SGKt(SC2_(X1LP zK0cl!s1G=%0OBS`N4<%9Qn(#wW@r2R`>9CpD~=APN;Dtt>ey_Sic3mDyVua5LFxZm z`69eyOwb63zKL8CTS-F$j^Ix~zk$nkzH#|*=V)xa86W=^T5xr3O<hfGL{*WHUZ?pz zs9L@yevf-_x@k2qjesZ$@b@Qy`=qJKZPXVHq|IZCrJS4`GIE%Y&kJZlP{Y710;C{_ zNL*X{9Uyr?BcMV9Uqh$XG9fH%i0G9<$x$fVCLIqC{za0cloYV!(pW99@bHoV%7FF? 
zCVAuuW~08Yvr`06Z&(xpBDmAlE(CzVz;;wnP~hWh1^GY<$MoWCYI4#@U%$PtkIQO~ zMNe-fODr5{tgKSlHf9@bZEfH-2*)s}l?$k<V!xx1CPS~Yn)mP0&&kOFGJbMmVuaG` zb;y3{#`=0sUmwtrL4koX$s8x)=6Ch6on2ir@gD>Q1Oxy8M@5~1X;b>S*xif<=S!zq z`}Ye6h$bjZU|b-@7v|;~&sCd&ECEDvU|;~S`t7waM$t{+QUN{~6dDRz*VGi&yLpz+ zHsgQQFBome(BT6*kT_oN9vLZ)b=Rx|Z2($7$VSt;y*LJ480naVgo6D1p5b8zK={eX zL>{I5{rm#K>&Fk^4W1nzBVN;;viE_QU!0#iIXO|tCie^uK7gX;Lwht`0$nAmtZbRG z99T`l!r#NgZGfKxD3>tKIUDUPKzpE+*Z%&MmzV#nr>9<L{qFWs7<GN7!hi!`RzU$R zpd%t>^A>POM%ztkAk~0y0)JO-VMd5AR2aZAGgkpF3H+6U0yAyx#pQkq^gsZgpzVOB z4k(dW7(OxK?c+le$Y$XCrl+R^)a7D3bPTaQ^CuEFFfaf;;a1(3NF20sadB~B;g?3x zlGEKmL9b7zNeIHjfp-OHG5-`QG;xG@bOX3W;K@W1Vln}uwjF?;u)e+yP-EgEzp(Ii zi3qSAe~PV<!hHg33t@ZG|GEV@4;T;s^j0LLkpOBoVEzEUIXRuzv5l@nu1i56K_P+H zqiO;FLzyN>Vh}(lCypB|#9*|ZQUGB`$mGDh1z88k)Au|X2?_YuuSrmHbF*7oT0oip z!1bpAT$TI%`~3R)c>GV`^qKJA%mqb7MWv+_N15Tb<@9$w1d)860?>khLxE)s;P%QZ z^1=b_3Ydy}roDp$fMq{FzYNc`3i(vPrEqXKKR=3)fd26XbpvR`mwd!z=v2hS#C&`n z+tY6KmCVe{pl9B?x@3hrrkR_10<R3P$&t9X<iOzsLlwR<C&=7yZf-t4{?g~jD(UFx zXlG{!JT-ZI7b@LpLr@=g%iO1Xj?jH(#!K61ii+bP_p%(j`uh=ryF|sr##PW$mRR2> zC5{wW`dY^b&nj&azI!X6tJUV=-WBu)+$8Wr!0TC2a(^QRn0i_UH8}qP!2WPgPkwRn z{_bvMY;5jOvC_ZwBB+sOq@=clmq=g{jA+99(pFaJ-#Xat|9N5Q+(i~-)e8v7a;m&* z-}|-n;H2?I?eCQZ@F{&Tc18GoxlH+|P(&!lxpUOdBYEK+LjQeGqtbkm8tB8aCC3dz za<lhnOtjxlCF!uNv;Jdw4J>nYxDMhq4&~M7|AnSv+}-iHe|K<@;)Q&K8!aQVt;CkD zxgpOMBN%f9+Zg-2qGyQzVi50nIqwhgNVDMo;Q%R%O^(sz!Mp3nl^A9FK{~6BLC{w+ zBZG#94h2^yCfO&aST9j*EX@lFO9@rJ8f~!t&NyMMLF!QCyo^?zK=eS%msXH?6(OEk z6BIErGBf5)IHP)<mQ&DBlU*D%MeQ#G+gg=b89PNSDh}Q?Zl<}Sgs28d(KI#Y-+JI> zIFEl&J2^Cve>%Ql3v)blMPjczJS4!0xzUS|_}=-I>zl#+Qt(3u!?IiJ!sEzD|LG$> zkDyU$SLT5uVJkuDPz5O`tC)^Og>O5)HT+u29etEI3mY}xa{EWaK_jKjGs(n-FDL0G z@hhK}M~baXu~9LFEr^A4ikpo$?@O8^bH^lYKT>Kswcbctd=&N(Cp+MD3RN9b(pvf? 
z*}I!HjG{q<O91CXTd~;o?Ey^#0f+i)APy7t8I`BZU`aI=E7@UF3r7401D{hK9jfs& z)x`o8`o-d@7QI2C7v9%bH%={QX?n$5SC4Ek-*4^~ME!tw4sNs`Ynbb(nU}K635;Uw zQ>Lrq%&4TCy7S<e@|}&2-CVtc!sjmo3r0<E)+q*3IYU)k{67jR++@bcek6ZWjm3nh zQp_Fl`*Y!GEBx^pxIwj=llaKykj6s%68i`5`72$%vwM8JdubV?(}TOR*b>h8H&6ay zqR`9k*K-U+?E`9YQ$qfdmToXu{m0uwjNjm3x$%%L5F>^?GMZ3yPF!5{QgcR4+D1Oo zq_9kG6rAzh6t&jvFRaGdv~H}&+|vt5Y8)~TGcca)xEGRwA=&V|U7-XW?hv2LDXS~1 z>L`p;$_rZiRIZAgT7NSi!aMr)Tk~U~1@4mj{o^b{bxzGYq=Nk75*t%2%)fN>CHuO< z=4Mt;r>q0~?;D$%@|rVYtlUdV%}mX#<YWhLE_q+qlmQ6${>`X!bbp4>-(zNBj)#k@ z+0*nUrp7b_grdei6y)ww+V}6mYp~TL;LRRhq8#C3d(RK$Px0IJi<e7T?BA28)y|H0 zTxSC#@uultKn6aHYF5~>SzRE$3!nC&R*W^Xc+bQ*I_RjW?XD<d*H=J_+%Dv%M%%IU z&LMJ@ME7NuY02L31Un1ioJCnK6Aq-@eVhpq##fAdTw_AMcYvP9er4sJQ}j7Nk)LdH zcMB5Unn!ttgs5o*-!iLO^Dgg{yQSu!K|UW^c1YzpGBSSJIpRFkKjd|(dYT!OU_!5e z$7!l7t#>k~+!#f6+d`jzh+w2j+&xX0^#cP)(_KczT73!OAZXm&y;M<^YRIkzeS721 zM3ulA`CGe5sxH1m+L;|UB|U{mLdJLm-n(nrvOH`!=+z+mfQNh18rC<p+}hERsCotl zI}ArdFB$Sk*;$k}j#IR|$YX{HQxi?h4BIvg)eO<G`cVmDlJR$XyJoo=9ZW@5-SEWF z^gPZ7XGUheZ44Z4NDWSNaR8vdbwi*;{XyJ{QqRGOMeAs&uaEf#?T1XyjAh&PCOIYK z*R53^rFh^{YbD{}p#JD(P3_s`VR_THy@sc&?ZRCba=*aYhL4t(w2y>ZA|IjO=lr2C z%%|?JEdP+z<TBTH7q(<+YPqtufqSxegSU##bY30P+tjEbnMco;9T<sryfb-$L*YVD z_Fzh9m=AY+hMMFuub?^4wY!^ieN%7t-t0#*U!!01JKTtwMHzX;RB2&eVX%dv+o5BW zxx3bI3yU&mb3|iEUP7`rm3#hpCF%;~)LdeatqI<k#%S8k4zb9e>{Px+a`rYd60VP0 zN~Tv+Z<H)J@thBkmS+>*JhAQ1uh}d)mMuM&$qmn*FqeYc;S4<TX<ZnWy8QF`S1j)q z6eP{fv%aw<^zo6U^GZ<V9*;J5WgtgvU5TODt&UU2?P~S(1<l%W%-!{Dz!7mpZ$#>E zJ{0}$Y1lDJm*G+I$t||a8*ztQnXUX1VaE$~78kGYNC61zwj9L|kJQyv&`RHlo6Y|a z87mC!o}L@(4{?n`zZo;G;552mWFeuVd9<?9^YU)}Q^^V|gIHsWzU=4Wj*0X4al7CO zl<{}n{2AYnyE^2%)}|POv&7)2wZDP$?j#cvY;f?Wk&dsfogt8yx1;g_yNNJ@-a5QK zFPd2}nQ2uthEw%2#Vp=<c6qPWad4=`(I*Q6bS3Ab8aKTZt!hA?X5c8K?jO?_`pMxj zr7SJ-CF(&RfGCUz2^Agb)9OVY)Ne+r9cQWi%Xupfe<HYN^y=Gf1Rz;Q*t(Zg(sH#u zc)#IFi)Edc_`SDha6W9XzoCHx0{Jsub8fHP!VjxR;C{DqM{(CB<l_c`AUo8$H>r4D zA{_j3Y<<GCWHg|@W&gDj&)?R%;^xGh*m)MZoH$6iaFp`4S$*ox_BiqtGleWFEJR!q 
zZl1nspKxn2@2D>b1AB;7(DCX>C-nE{RyU+O?Nr7Sv#y+&0I*)vZDL|TPRN>;!7m{y z<dsIr%FDpswq8EY1p@;E@yczyZ*QOgC<WUluV|+<!mMx9Wi@ZOxFrMy`5}T%`qAoL zaMue+W0Nb)u){tPF<GA|il`_pXrGGMlDDhBPBe#kTAMu_&>l=iUnS%3Cr#5Wl^d^Q zU^w~+ubI(p4g30%^EjUbR>B6;n_3*rG>&qe?`5gV4YfTSMIOy-f;w<}AO$dm0tnR$ zh=&N-I=QQuyJsZoEeRX(+U0}l1eN>xri`SJZ!;Y|#Y8uETV<oHhGcXz4ITO1xNv@v z4Ru+5M7pUxKndcy4yP7`cn-nU!GK@C`%Cf*-(bHXJCgm0^^Qo^sHED^h64j5D?a}h z(Hd-DpT%38ihM@VwXB2u_w7#zWyPg0+fwVRqfL#J7ytSgWxI0EcYXBJ(3qc}<9<*2 z<!eP<-Bnnnz}-Cur`eR*K-W!hYV9R5n+g^!7IFl>Fc)`|YBBdcC<q(EL)o-KK?}|f z0a=wkhlHHH(2m&6@vu~fjGAPAuWrYZSvUQhi~Zu{%)cYoW2d{hxl?~foOoJ;r^X&3 z?AYBK%uS$+jcPL+EcQl`!CtXFVGHcRbF<AKpMHUYV6IcpmNVrhs-E>Mt4FWR@8Tui zonw8m{mR0^q9_W1eEC{w`>m3d^@>7znD$4QlyDv40^jP2!)B=CuX&VZSgN@2xZttC zteToEPkaLFA(x;r1*qtf2zS)KTg0e8*2&1w3rbr9-7rgvGALg`Oik}Mn(G@lArN3b zi?0*`Q`mywR_ncr3ck@KD&&K6q>`Y*k$KfCzYWV`T6F#PHbw5K^U)|>c?x#*AQk;b z{*kFAz1A<9T0+JmUU$`0=GC#xN(B6Os7XHsmz0$w>1dAj&@RhH=cdAbe_rx@UAz2k z%%i`z3@QR=_SL1PtjZ*!%|7i#F9WTu>tjSE+ABc_LRf--q`yymJ`vs?Ln|5wKJ#R2 zjF`?qQ#gO9ypEumn&B926;-sD$=f}b=1bD$kUWAk)`@kedQKTc`}^;i8uK1=*&UQe zS}hNRjdVT3MOg*aCP@diE!gi%qzH|NlP_1K;548}P=6fpA$>*|46$EBXJ)EG7+sWW z<p+H{B+ufBe%MTL!q@NGx}jCXk&qv(G1O*@D9WV91?=5O5!%&N6l`S4{iyK9f_Mc- z@v=YV!J3Jx2!1uQ3I7{@EK(V;g}|U2jH7sWUaT>c7^|y>l@=X%2u7R4-u=U$Pr-Ho z$;YEJqhra{1<r;Kn!cJTBPRX{WyXp_MR6V#?JotTlw2CxQ_Cr*!!1Fh3=t~-?fuBO z0s#;FAR78Q4Zzn&E48XGbC?(H)#1@Hv`-XtbO(ENDW^A=RRQEIB#dS06JrZrwS|vx zSpA<_U2hQ>_^Xr3TV6h~5{*rW2XN(tFW54Xa2Zt`w1P%Itm~ov*=V6VbSwF+c4re0 z(YT~4nU#*3jgEqE@aW+k<d^>>LYk|`D7$8pp|`rEhPA24r}`&@K3~dkOOGci)28tp z&=6o~9Rt3HPIG(mMzNZb*JWhnc-B~kHT(v!7DC5fQj-$U?byis<h1%%^i7UL8#gw7 z*v-k}nw09n)f2&BUr<qDsfoV+{?TGbUr>|l!7mNit&rjuR1#=qj!<KfnRgClFc4Zz z9*dQrmS7bwi68LpH@LY(?B2<*gN;7K*LcjXCVUM{z)^oi-0I$f3dpMEZ25%wPN{tn z7U--2zFh!%PPq$|KCftDXj>U&&Ei?b#={2uxjIL6u+kP3=3Pt5pP4|>6>@T}KbM~R z`LfPNN3_E`TzLI7LEBgbSw+gx%!1nE^kH;l<n*$OTq78_&a|-Krc&3k>2dpZ5&<6C zXZ|c4_Of_%@>6C_DZlN`x$JlKJ(GcXCilWWP2tNo*Y&m5)w!~xK<GG@62!B89319n 
zqAa_<^=n}ZHpy6_M79!X%B2tw%n@yITiH;RS2Jc*d<raLo=fU#Z^ysA7)4-M&W=t7 z*?FYHO9Yq|NknQr7Y$Vp1Tv$h&ez<G6&F9c<eg{PNs6*(Y+wX`cX?gVy>;Ktl$YW_ zEm@hf;hx9T{`o_ywF$^6xNynMjg8yKF8iRc*Dj`c9p5Su_{deT#?K;~zYDFYeW~Zf zf>~JogO6_s@oM4TudjC*+zQIUDwM1wbQc2Hl!9VYsl$QsPW*#sn8Nn77-6d~Kqy!b z2RiCfqO0|S0%^XZ=0(uj@NK_?40)e|?pn4L^N{P$wW1)AXnv|Xe2{xEqXu>X-Ca(B z=HFYuiBKE@NW<F7W_Cs@+VSzFf()y(M$udJ7xz4JS}AwXV}1cGVY#0)LZ)ry(pJa% zkczP9=G4X(Z7bah{B984!Dp$jYzTBd8{y;Q4)+C}xjXjv2n0_B`FBE7Ra9@Q4(r}x z#wi`zsae;#$);`gRlF2-pp%i4Q&muq2)@k3=Y4qc1=xR4K$FVP6ME_9h7?k*)m-)d z0V4;yw}-=JZbQsz6{W2LmpwLBJ)Ox2XL0fW68{2qeHoV2XNt}miIt}egKKuo4IT0A zzlV$s(^+k1t!aXHc%j=V)8w~=lOC69pT)4Rion?=#+57Oi|+>dtK*y;Gss8U>>4kX z;s(<6(#sgB)&EK6ZGp3?engBGg|sst9@$s_d;#dNNk1U4dpMvI+5hQy;e4wlak5W_ z(yyNpBTHWqBeKeaJxvd`72_RYDE-z$Qy>R0pu~8z!TBK}5)$G=w~Mbc0V}sdGC~hj zQu=h#R6qp!;Vt6FVVd`T2vEXcgeB><&P`X_q0>!Z18{_RBHOsfE}|=GaA+Gd9h4yB zsfH8;QeyQEsL-{j%G>S+yEUA&>VoW5d@6F9r}S;TK+ts*rKhEn8iqQ*jUVMMT|QuC z6gMG4MkW?0vY6VQPH~*v;vQn7+&&rmFfqs2m3aD6rmrA{(oGeijJPdYfqT1of>$2Y z1f6@9##-#M<RxT%Gp>`u(D$yXM%gdS>7+NDk6%{T4Rim@YX-B*plWb6=I)W>HzT<^ z=STQn^-)1NO=^!2CtQ}iEOxht4h5vJd8?AxLs1J?!0Cm2no-i6!f7?1C-ELhiPfe~ zOIJf(8vF0o&M(Kzs7oehW;T}E>grm>(x%0c86R0Ej%dV#Sj}BC8s-ll_UCIfq?NpQ zh}Q|tj($B&{OX*T0oXexYK99$UqEFKPN0a(;myYOBhq+oKI4ZEmTFb<x^f$bFB#6a zYeW0#bv$3hMFOJ-byt}8Oq48=puSDB@)sfm6(6_lYc~$$m;SsvJ&NwKqpL+jwqwPS zfyrTp*pqmgea=O-GRPVK{4G~;+Q#->pN+86@a0AxH{NL%_M7dMD93jWX-@_e2k%lq zfT9e^dWeB+gYV_XQwYtosHv{Kdv(8O!2}Yl=lZ+LCLJ+nQAWi_In7s)58VrYtEQ14 z0+z>TS$EUnv>m0Thnul%eN^~>eVgwk`o{z17*V&S3=t5T2^Mxe5X<%co}w)8#ZJP7 zvq&BxUID*@gv39+2UfX9##M6XRz;t^CSu_hy+xoNn|ZX{nzgcGh#OmRH`*E1XkW%) zH~jhh0fa>^xhcF$UpdB*clnx?weB_iIlUh>C5?<u9PD$}D}jNQR229{8L+6P8yo!w zo?vaAW2EpL_wfUEw;o!uf7pAlVzOTvj{6bnJ?y*bZ(l3<FRu6R=-e(|(@?=jOoZ^9 zD#*&R^=-&K+h0QO@=cZibLcIKzYGv3i>#aXAPhxEgdycts7%if!H@h$bQ<NVfu<5C zgNFS2HDUu*$A{~oZtW_L3szCyT~E@K^w7K=RIgP0SmD^(2UgV^HOc=OUQiQzu2Hd= zR_504M%usHevOWq{(GVgwpz8)kkRV>a2{f0cUpV*qmd#uX4>~7`Gv0PT#rD1r~G`U 
zyzB8sM12E?WPjm^i;gSoqgbar>wr%6+OL!OZ3klqJ%KMF7nup`Sw?Rrfng&&@cMaj zvR+*b%q#IKJFrb{&p`3u(V5^7!RfQOt}S&;*M#nm_{n*8yVp6srBBJ-RDt|@Tr8p~ z+8e;O=JuGHG#sMnSBGJi5#yk&oZR18T!dEo9yUs?pyw?}vh4iVZ)FT*_e%&eSE>8? zglo~7-*-yuq=DJ}a%=cTQ+!DgyPyV~#o{<PFpM>rT~^uEF9QI``EMQt=zdm5Z)2#M zSkogzys~}22+bf2PwQ%G%o8qh{^=N0P}W9)j3`BUI^Dg8c*VTB>YQ{wC6j=71!i1= zv+P<_i3M9bKZFC47aT+rwy#-d-Y{8C)+-*&`_+E)v&k!nxKt2@@59yUU>!hh0Hqu= zvm;+^xHN)F*-@Y%95d=^^;~kvM97eMvrKSxobCZcuo@pvfV2JhhZ|K8Rj#he_)p{d zJz#;rayA)GWTVDLeV$L@fI@^-d^;@*^O~F0f)rZX0g$y8DnsOiiDaaLoI(N;dL$5m zPeM+Ur+`3HP>6mryy+b~#e8RZ@3sJPSu9t~D=pwt$GXI)Ze+`C3^9#QN*bRqA-@I& zM8b-R*aZcxUvP%SHu}Ui3O(Hu@?I>}n$2!=P5iS>-^<%ip#z#Eg%!W(d>oYMHx>k% zomYDNLzC*b#y_m`^qo_r*5#{8@_Qw=A!H_`r_gm4kT)`=`uxB0IOb(jgT23Nfh$Cf zEr}tGSYt*91#au{=|vp#e{_y?=kNb}EBWsO+0(X_H8pP_@;d5ZVPA-w_vea&jyl5j zajD?!gs-g8%SL5nBSjpg8wk;p0a!emh=B$ZhuX6Zwsrg8OWGV*f#qJX4ic`r9GAf1 z?eLnoBda{!wB^FMvB^T7jWt7Y*6W{R4x_xp-#p#?_1m`>5Z{l!o#b8lc?hS^Mm&V8 z`9D8NxUKOVpZ{M^Sp4TR7(H-<^Rdd@?LMTp0ckUiyK70q9fP4G!fG|9+AiPz5Pc<B zaB<r@ZFyYL8D382b@{q&6oVC4l3e#kdVY1qs%~W$-vW-;cM&fo2tmq}E<5gtz_oG7 zN!;R12L(H~P09rYARzEkbSg@!QzIH(<qhj@7vq|Q;2@jevQXFouY|=LVG>^Qvyq`J zcMovLw8TQ(H$cQ%v&v#J(>@O+rKsd_5L(0F7u=wxo3y^WmHX3i^?JzZSkTI*sE|10 zO2yzH9jW{$l~1WLv8?CUDAv5CMJ4>sH#7S;o22ip_qPuZ9p+p{t~#c}F1-WZkNK9R zJ;Y36nw99!_(J>2CesT48!Js&PJ^<V(huOhh7QB>v(mkJ9^-NwOh1)uF0jw#byRK+ zHv$ez=`|2;m{*7+@Zz|oBa=+)JU%*zh8^73q@*ZC4qk7)e}YMJC?$+>1N|=V?KW<r z3}MWIXs~a5XB6yEZEf|KMh`=EU5!bu@viK6PR&m}j9Xr}#aPyA6-G-BHxy=;6w&{h zyYPerbq&!VFdi8-`D=eG3ov5w_eE5xZT2vWV5drLMRirJrat4*@~tT0&nt!&q`_Qn zg4tJ)k)&K^<x!4PvFL&Z#y3F(&dZt_-~RBdb)5<op2@^^`gUK2A5X14NNY<u6DteX z;D9L;MTJHRhY#20>y}jz(U$YLy|AqVPZ}5%MrMw^%9YYCT{?=I|F}g%iIV)W&b|3z z;9{!}p`cW|GNUllMZmdX*4ZM?QXXqsDQ;}G!NW#d<{jkoL*<~J;EL|jU80$0bxp0p z9~(3C*($TNKhV4-fw$}oOX^LA9am+)Azt6P!<L#JKM$S1nv2S%pv?ZueGCFW9+5)L zklpAMD`RNNl6Zf7xK>BMNWN9;F>745WdnhPYp3EnayftLRxtW1<Nhiu3H#`rQx_Bb zU#JN#pX-xNO`K>zSH4A2Rrz1wKRVe}J7$yT?BR@LC?g}@P4hMT3z>etwo`oh9BAj+ 
z8#P)LmrRxtKg6Vfx`o}nU6fS2%$lFi+wDdd&Z~@&m%U*DP<}G|=o67t2j=?<e-14I zjcHQcGwPt0&v<iOGYy3JkgJ@Ri~Ib15(DL^ezEBh0+EkbtzP=`yK{kqo?$g?-X7&D zj$uJ3Qt`QtbgE?)RSkj-mcyd{+~)e<!axMv?0m*!Us|xw`{lt7Yx8WKH8DKYu(Gn1 z=+w(9X_0-HI&%NJx0$<HBgKvm-+5LiYautvlw@_fFrN~d63Wz@I(R1)B&mCIe|0(E zU!U*WGh%u!J25%cq~rO)CU7#fgL?y$f^fl^g^6Wua8J<AA-|yE<j)(;F-0aQ2$DV0 zVCdlbLSd2loW?v9q-3z9?8&ve?+<R&JBRs)#=NGur2eQ!pW#KG4JX6tHe+#+vw7Wp zB1&o@jj_~|k3w0mT3@CmM)Mgp5<E}k$dR7E@wv%TyU+F*^|aC?PyI#DNe?};EV70L z4mzG=sg1SyQp3>6(1z<?tW|1VXOKC{4e%0gtuBqLWNuzvBd6M-qhsx_DV(l9GAB(+ zZPC@Nu52(S7*)HMeQ(-hMh`r{ZL5<ke2v`1UTIsthqIm9;1S8=ZleBmcK72R-DU!} zE?3a+&mO9W{dr{|GfJv;>4|#UCLB(@CeKiG(%!y(E30ddg@%#Jp}x7>g7B*06|r}Q z5Qz|NF)jZ7{%WK?B4W#yj+Q(=7N*DHA1Y^ZPMlfz=_B2Ceb;dF9W(3Ms$T1dpxi&3 zWoh?vS{#|VIei`dRR+r{GVd<7P;?PJGDNyQhNbX#4*f3WR(2g9x=594zU|JyJUc?g z!jz4R2|Ti+;-;n*l~MG2EM{tDCTEmMqc8bZkM>SuBk$@7105}~Zsr*XB*nyFDJf0# zJQS=QbX+xIVJTrF1hRpA_ADyQr^izq6n&d0!=c53P&>mB6J<g6;AS&~zbQ`<<- zUnijO(Hq~I(J?l`55Cd;cH|~;7xydt=<oi)VLEyF*rOOhg+7L_N|<jGJT7Tp!@>WM z`0-{P>z;l%xtNDG)_b9smWYgqmRg99nv_O(2utv%?vRue-P6_Vqg=-8Sr@88>gbqg z#S)VXoA-ST6zCFB>l6U&%Ha`Sg9L$2(b(87f2)jSq{5~GTxNol6m5wUs#|}>=h8%l z?_?)9vYJvez^SO7Vp4LLIZF5-acgiqpA=r&igNz?_~G&IrNJ&H@3*tdwo<TP2S_pL z0qJNl#<z$OGxNQ^i56gpK>h!O;!1-+ir9*ITn<NSs@P$#Lb~-UMD)u}OnxDLc@1nm zk^7C8-r{8X9=gKq`W0m8VEtPd8Eh}1u!$NZGtUvvUSg(1hev!WGQDgddWj{ny#rYL z=gzU;SvPC0@fkP|HPTqvm>6)->NgT#G0C#srya>;BuyC^;dPp903inGNPWAK?k?{i z-80(_5|a{3ur$Ct$eZQc&<@aORO%DY5m6HL@Q;ttx)AAxR{XWmk{TNg1&F@xJHdCj z_PX!%tHR0p;!F<*pNEczpocSg!$hauw`)I)IQDsK;kvve99mtglFSS@r|Al)2L}0Q z<u@1xQ9Vsmfdvn1@AKbh=NSGyN!hkmmvA<6j!E{}KRQ(!|J22#PwIJpj?p{RBdD!# zOO(ff6Kr&S(Y$K(#t;eup17o=Ygx>TF^XB|Fq1GdluAR5%1)=1tBreN%2?hR8(aZy z)xJCxJ?--&Mzl}!V&`e(EbkY7Ag4uvB0ryD!ofqMq+yp((pn!@ctV<)nFTvdX=n<2 zh+{Tl78bPrdE0{zOXG4jt%ljghjV{B2L^aNceMJlK*PMcyuF^;kdRQ=(G0)3vf-Os zt;hNPn^w1~xW0V7J+S{(gWr-4p)TJ^ru2{BpNF)#6B6Semr28yW16P)?9KJ(!0e1i zF&{3wtsh~`@DDY2*@CB39BgKiA`%l&qv~y2vixi3Sk{)OK0-s^sM=I7)*bvz>K_Mm 
z<(X>y8u|bw_}E2*mBRCdVp~((L-u#?OOrF(*YdD-Z;&<wmeawg0WCSyIucipqhl!H zC|~;m_z{4V#Uoz(nHIf~y<54F$;p_vNh~o-iPn(CMo!<cJY{-1`VPmxJ72fe-n(DY z<mDN@AD!e-!opg3?V8k|SL%6({xdGc6A&!I<qj2_$}HI((qd85Ju?mZa;Jln(&Bn) z3FVVEl*4?y@SsNYU21)E>;2R7sqdQMx;&SdnCKkt8=cKLDi8=f)ydY<A^XtsT={}` zaME69PL@hAEj%wY?@R58Y`J(c_wDo6bQ|JeRoZMjq(lz&+@>P%c*6O^nuUpt`yGCI zW~tluL~Eu`YiI&~GCo0Cst&)SMZg0Ag)xPq?b?B?ro0UrXIJk6&r3J%eeT2e<fL!T zS>wS|3BYTzz_R)LhY=VNG3%XVE^RTgI$RD<n^{G9Hh{4L&eC#vgL05kxm?*;TKhPa z(=@<g+=N7<JQ35YtA;yg{=(;;dz$<7^h~GObx8lPcv)K+?6Bi~x#*ekn~apzB5_p7 z<?A4cFc~UMA9Wt??l^#z-rfOXN1<`F-V-WYtRcYA^b|F<`~M74l#z*95O;`*NQ=ls z4fzifesMs@M6Y-6&1oso){<N}m~?;VPPdC0N{k{V_0Q$H!M{x<&=h|}cY}QdZK#NS zHUmRcDOx$+T)i3L`1TH-+d8@J9Ujir>e~3|PfiFRJGMyJqyJ+Y33!abXhW9=X(t`y z$LsRS3YeP!!(5{hd$q;!@imvVjNGus$intC#pm5C>uJ!5b!j{v_RAN4zH`SNt*FI^ z8cXZ8*&h%Y9QlW37QR@#kp_NK4+jK%?nh8?a<iM;MAf_Oynnl347%Gm)M1`gHIXO$ ze^Pq5gTeWSq;S6GM;2;zfZ&JoE~bD=58cU5D}_!*c6a%a2ROddtTA0w5Zm%u5Mn9M zX#JXUk(+v$`g{C&aK2n4GtC>uTP+vosa2mbMg6CgCTHl>z~q`Al{Xevx@<JFZU?bt zNvTWRd<uROv!Psj+3$SgWVtzGoMZA5a((T*5woFX!-s!?{_H9+y)d<0vevb$sHxjw zes|X6w0f(hC3(NTv1W`#QMXu%0p|<Q|A}v&uYTE;nyBY#J7*7&ch~Skooct0NoMvD z;Y;MAw0gcgt&4xRU+ojSS<+lh^6RK+a+P!2x-&fgBXcZOpqZ)61b_3MnyaR(ko2mw zcK>Cq14}y?BDSoSXx;Z#KiVWKsRT{6FVN1Ff)vKNW}>%f{F%4d8d~zr=A&Gp@Ggbu zb9kzjR8-VfR%S^B771_TLPR;5@2=;_`SbjW-xslr*QO`ya8lC9dER24nq$tn;Ew85 zY=`s$zF?k6U6$I06Ke7DS&X%TG1ATO8?KV!KnZ$4AQ=JW79Bc8U9pU_dZ4&O{r><e zr1nv`A)}R%ixvj*Gbtr5Hu~|v7Lqu%OertB6BnpO(@)dY={g;Rfref1zGPc>n=wx@ zQ=i29tH*L?o1Nno#}W5nZx``<+G|oGOo#v(YMc9IIC8jEz6~_Q7ux^Zx38d$|BSNz z@=gA7cXDb1Jl+n7WJZ%>?w<P-E)O;KKNHFGCEcK=x&CeEQ%Nk?0=>BOyf*cPBiX02 z`EC!^VNVros+VJDg+mAfSB3rfHW5cDRy8IjniLKl9hGZ*EQF~})(tx>$1%qM9~taL zooyTOZ-jwB5D)-gT@UKNS;G#4K#T|2;1R&TWL=bJrzf)UWTA5te{G1Fx9&d$HB5%2 z+x!Mf;RWOylWp3PCo+WS{9bF(s?x_&;~SSy!|A^*Q>IX0pxRcv420~BgMWr<2S-=> z3icYm!6aGLk#|y@oSwXR-GPCHZ)VM^s3SqoMN2_P)sod>P&mraUsGPb`hJ@9VCN8Q zHHU!|S1nbWk2Wd?Pt7tc!a-)7KhM`OK!Hm+AWV5Uoh5{)VM9I}>m{ZpFHOu~V&OAR 
zng)fnI@qa6o0zC7nVyHt=0@kxR;rg9pB1`mq5v&qt#<gi>JKhNAk`r_JgqLTDKDqM zXrL75xk&w?=?3vgNn-`JwG~~HqJ+gGpCXuH!4#jhjyJJ$f$nRJ3nmoQ>YtB)WLU@2 zjI#;fU}4}NcZ^taAqQj)$fzmR*EYsBYicUe&{6|4Gzv57=kp(oD9o3Tc+EjXIfTa5 zZ-SDdyDM~c<eU9XO-+=tu@T8cI5(tRBplxgGai*Xs1#Nh6*5NeeeNX--La84{_bPO zyt(9k+`FnT18KF;yr8v^yiI&=w{4o@1WXg0vXQZ7_}2)TGJX&*Ah=tQVL3QBiw|NK zzBM@<M=Ne$eSkpN8R{-EOPW-*U-?3lWq0P`$^;^YBH!Zj>`li4+v*Z?aHc11&lDsJ z1{!;p05;^yX5!h6rpeGUm+j@Pgb>O~ewM9*U~jTt)$k(6#mIP0c7AnzIW*)$NBhLl zc-{HL0M+=#IZ_@bcC=G{Tk>dGv?BxmbLMvqQqfcW>FL})!Z7{J6XRB$QJNYP8gm#c z0o<FPC$MybNx7w>E|M^fgOtCp-(cA~U7eh-hX5rmXXAd3Xg4a%Oe^iy`~a>Ef!O8l zvaqrlnQY<?vp_?DMlRK;MlPwIwNk@*gXIo-84$v@Vhn7oEOb?xV|gN?6o1B6D>3e5 zyL`I==(T-ur6QrAr6UDyagx>Tfvs2Z0vVuh;18g0KfUB9eoReHlz?j<ghs_AZC@-W zZH4nl21x>CjdF|IY&1;q?a{0kN23wfz-<;_F!ZU3so7G93GUQ1E9T}KD2ST6IzijF zgHFdowYsgpVOfACsvc>JWDS<Hj3*RYF*;J<TJ7|8u@x0J+?Leys_R^4Y_))X4|I7? zZ=bE}gI7{qlV^siHiK+znw{ZrU`dhV>Pd>+GM{MEHj6!LZj!8~Av0lUt<%j5;i{dv z+d22hBC`S=&o8s{duwy#c%P!iQeB(+sV=#`i28L6l(W2g_OLTptF=%AbiHhr^otyU zCMH-!f&ek$_;4iqWEwe!KD^u%;h4hkj@$+)GrlKXaumpi$uE6M9lG~2x1>Zpo{v)r zk1Jpsm+-Lb`r>5LJ|IDN2_oWNpS-$wY=fOQA8dODhiK?&)$1)E|Gs#h8KGCEJQlrE zElK<>4`WkvqMA5aK281OGThTuw&nlj1qzs2EoD=PThFu(&0E;zD&-Py<h-%3xwu5O zFs1v~HX-*2MfUS0x`Gz)Szh(oGcQrFHHy5Ry*fG5HX`JIpUFc0AFcKO^}snYqN$bD zD~OWnWO+r!s4R-16L|ZdW?gQge&CL-z+*SqU|&E3|If7>B?gbE+{7%+&4b@@0XD}D z@$k{o(@6>k{?t>@(Q>wX;Qasa2Cx0DeY;*2{PPA^n;`H0d*AJUExmoGK0>jh`{zLZ zKg(kO?PdPo>tz4qfT-z`Pn)l5tkM-qIK1rp0eT#3Yp~<^7`dR#44+Ka#Z5(0TJ-6- zIi&r1WIZG9BUq1jb!+lGlvoJQ27aL8*lHD@tJ~cpV|8nz_mqZ<M2Pf-C!r9qwQ>tu zH&*;ifY*hds&!UebQ?h6MT~yW$#W@2QlgBQl$jhk9nLqV(e)w!eZi^oo8t!qQz=E7 zk{ZOo)6Bl|ve9T_=ogIC9}a`_JYC$T)Z+Mr&K!=aU5!7<J{@B_E{ni+p|G=q-#u_J z(h_-JDY)w9j&a^~rulRX&ecKTM{FrpEj?N+Z8o*Cri>8pnB){CB3s`t-J`72k_;U@ zc79X7q^~-0rKj7$xH_Liq#^YdJQmq&|1_rBj%rqDb8_)|9b-KMxOQs~0|B8aG0%Hw z^L3TPG-1K;-{_Y8lCj12ycU7>gx9rcrQxJbnF@LoaJ9I*;bre=6^#&XY;r*^<9)d9 z3m(W${$FKWWmHt*7QIL#($a|1(hS{Q(hZWLfaD-44GJP5%}95HAl)F{AssSEH%L3= 
zyW?B$*PAszW=-CE?wRxLv%h_IyfIgEnhTW<N@PIP3H1A`cA5`Q12Q$VoQi`G39V#z z*%bYB4_?2PB|2<rXL&}C0latHX%`*EFHCy-ik!RV@u!%W?*6GSm<E&jy;z?gs!e2+ zt0TF?XJSUC=E*23Hl`ROk2n$;r*SCbZ8_e|F$5QuEki*`m~2jV-42}MlE3+@F*FGu z_<JF+eR8)L-M-Xa2~uT_7F9YY<xkv-S&HLgcG6Kds{3Y}c0c)Ka)MjnJziF#q%_=B zDFr#-xD_ZBoZup&BH~A_rL5mbV@Wcgf~o1%N9<{}dgtb1w^a62^TP8sT)goq3_+hf zYgYWs&=WSlt{6J=l2Q(o4rmvE<6v+NKLnbW>Xup|u7*ZG-5J3nn>|!u*Pc>R;?l7M z3Ny%5rNcA9rYHD}a`cgx*cBvfYH+7;ISL;DZSSwXt|wM~uWV_!?)C0^U`<VI754}F zlcxh)Rml3Zhe@~?QPLDX5zqKhA_P0)A(MPQ{}dmmsie7_HVePNS}635r%ioBWYYAB z(RLVS0QT1JH?uqwQq@<hJCa(`FST?yg;`s_<$uBMqrT%v2}`!U(AF{92=mK}`<&Ov z#-X{00gPvq-;Lh7BIy{iYl5R5^4NG7*VAD|&qVCtU<QS}a?h%NG%qcz%&|!Nzt>0J zF&sh~0Y1I*b7JnLnQpNXv%au6Co$*|q6gbP91iK_EL@7qoj6ymdCTS?74E8XIiF$J zV686eNSqxig@=Df^oq}h(k?FFic!}+QO1wHZ?xshjp*_5<LIi+@PU%}&Nc-aL%*uR z=FpS3@66_B5Z5XvL%MDxi^hwdoWjBS`J%_BJ}}xZxRH9}F<lVIezkb4!p{dP3`AT) z@T&_>B~$Z!jd!sq??B~BWx4I_gsj(2$cbH**7BNf>w_zMdO~Gp0n<O;e_!R~L>ATb zr^iPhWq6DI-Dz?23pme;$SMlV!fHQ-sT|IVMYw<tA<L2>X2Q;bOhVaZMMPbJi2dV( zx!%)#YKN2Ec>0^y7jx7g!;N`+^+dwQUzQ3QpF4S&{w}~5A?d|$q@b`5Ph`Hjd`dYw zvn+LtO>s%DrKFm5zxCie3}0milGQGv>Ew-)`4qpMn-*jp@&5(>?c@r2H25-4p5#0> z;q%?I*NNC@bO12)_w);>9Rx6R%uM8UwAw=qVzz>g&fV4P413Kvp+?e;>vB9N<qeT< zZT%5*1LR%@j=Q)tg%RNVB9lJG1+QoDm3;DuAOo$R$68{`Lf)EZ8aUZVn#v8~Y07YK zzuXD0FnHQV6L3A^Wld@gNHMpG3PaS)+=;-s@u#K>o`pFLC##dhVmBu3Y?rshwCYbr zd86@OlXJs{tPfv$<OQUC4xFp`Th?a|#rgaP1xVoAxTp}ZZFKlF9d3lZ^1S7PSluE_ zyB7*|yllC2y5Bg!*wg`lK@ptmgR}v~WM|b`F%4b|ia+2yyn(m?jfuDof*tq~iT)|| z+eKTziPiL|)khN+U!Se?OERy&jY#*FV+2RV(7o_Ir^|kHcl5pAQh=8zhU^-M;G}#F zPf|<;klPchU2!{=RuHb-n9xj2Kg%5~G-;wvotb?jb+F-PlHfCn@XsLjayOz2Bl&&X zWf}Gyc-k$P(_GzLof|*Xgz+gNG(IA<TH|Su)PA>TrR$l0Lv>|!U9{EmR=+(|$7VN& zirQrVMaqfaFN=w_>VlC8`XkDh>KvnN_O^B-O8qnA2J((nfW_>NTHRv6%M?tqj89hz z3!>%J-E}Acgs8f<ur&8pn;avuE!pE<Xy-Wjq$Iu{iu0I8-V7yUYHnF%_YLC)p*L1j zuVg8Lbphv>Z!2X2-}?Ar%*`;JuOc&|#hJa~dBH+4D#BGeCnxvw0yyfQQE6?hJ$gi` zE9*-)y5CS|9sOt_;f3+S%LSE_!Ni(%WMS<NWwh6r%>p?HVzQJxanmozWx-$JnIP@g 
zEG$W6LQb(xYUS&wzUcPu`e58gOTFj@pI=$s3#FCI52i@o?)7>A1X@}Au;9kpcF&d~ zc%$0BrsxEp6Y{3XH=J`eO<zh-Jf=xc$j~iwU<U#@eaYHr#ySO@%~_~0Q?oawz)-k* zp5v0E<X7hsk30%VcMZfV8I9tcF~4mvoHuIv(|$YA;pt{c&aFD!I)?MmzCy`GKm8yb z2s9s!NUuGeCBDZFA;Y!erGLm4CC6}nhU@$x;}UKv?(Dh8bDh7pb+69MuZ*jqHLaIb zagPsVeamkZs@FA2y(@E(d5<IR(b5x|KmOXZ@o8yNe5$ObZXn23cPxkxaY=}mXGwT1 zu($4Ujs0FczL9AG2QzTFamm%4qUtt1T$GGeM9HAbFW@W?ms5QmEk!sdR|!chL5>(J z%b>ZLi84h{Y*=h)W~zBoML`RU`pE_V!YN$T&F9=H^?-S%zTQKu8DSUWBF_-a`oh~d z&7Rfnx{r`xgNtjIqqVfUw7kUZ_a$u-5hRC!uDC4g!IC4?k=A)XHY*9VM2Mu@s`}tk z6V?ACaA)W6K-#ek$^<n?JTGT3HZ$pA?w?$q-#Om6hl}00opeY?<Xg8BD_gB@SK}p+ z$Cl*eDtYQRw$`??6u#NNn(M<8am0x#tZ#Dhv2PwL4GhB}cBtpzH@AFgFgdQfOjjrp zHq-Q}PY!)2Ck-C-6WyqX_%TzY={tHsD7XM>R)j;LA77jdMt;L(iB1_!Jv@bE<uu@c zQh?>v^WR-=5zj%oTLGp+OsCCDRE6A_aU64M+s!d735`#EJ*4GxWFvu!S0bC-I4<-{ z5@RHrp`CsFTJc^;0XAvJYS;e$3KbwAI@#}Kqkb5*KQot!?V|nS<%y>TxLa!X*6D5I zyEBwnrF~kpm-?1*tHCyc=I)*zr~tSIz5YO~x}}Z@V1?u4b?1Qr2LOrIzY7y{qT3Kr zMy?xUKG0^S;P^J4i<gY@azFsWeerG4CY0X3Cr_?q$o}W@5hZuAf{bC6z%AcWOjx1D zd>Ena;KG@KffMuq1ef|hvlUBKZQPMVwU?z&{3}pV0P!Vp{87otaf4^g#*+^VP9LF4 z2IuH}=niJs%qL6X5};^aRk-s^;WnD^-&7|RF!52jJ_N7I9oUbisy(zHw5QoTd`z$c zEwY~w&|LQLr2(XTnwk#k@<aE>{yaPu5UeYC%T_kmlmuA;Bda4}Yc9F(y>;6o5p~US z#p?!}8xwEXObEmZ@Uah#gZU~_wP{cfgjq(B)xpJgE>7-zWblI$zib2#wcVMfCr~%l z?G1J`m#Wo6){-_CN345!c(`#buJM=V{5-b!jp;G*I4^pyHuahv#w&3i8jOYEg_}Dz zJ?iUZXJpG3)=Ny2lhfWlP9y`*+1|+^Swz-zV}{Z1<jji#02=)NKC${;{gjN-n}Wpp z>-Vbai{jWx^y60pLxNo3T-PgN+LSz51E_IAUJ)Oonitb1%3a|-gvRmjhcY^_^dS}f z!|cP&>d-d-zZn$<7Ot{G!2@xa6-NcNMXz{WgS0>(4MV7R63OW0^BYH35XX}u!}}I~ z_}+YdbHfhm$i-2VRK_lQm@cwXMcdbj3s}6d{N?I2w7WDk6v5iV{8_t>aM8g`$aACY zX4q0f`pMZIsx-l6;9|=IvA4mu26ckd^=HpxT?{Wv(uYXP>evULC+fwa0B-Jh-*rpF z5<6-rnh?qI><;Y2Oe%4qN=HJCu_%l6yzx>`1*Qbj{og0aG4MbveZ0NlO#z#m8)(43 zczfbRy`Jie_43R7&iy!b-z_xZ=Pq1XMG1d$D+Es`9vaRY<}s@XOxMRa^0q%c628oB zv_*eF=Zji*ek8FcCCrBwQkOgO9=##@mrGuDV5~F)5x&XNb2{Y!{dv3mMi!bum|k2m zDANaVJryoSnyaYD@7dG#VK7}5CZ|uT!x89S^7+BG95za2AiztDLhTBi_p?3?=ZGjB 
zvS;pB#xvXa>jOqnQ}MYqHU2zP_L<l7(}%}+bBKY_f%1YvIe3o+cfwF62&r!kWRE6M z2YR}cM}sB?LWn@rAg|Ua5!CKFB-U_IbN&f2L_<o5)lKFJ2Pr*O{bVfQsi?R%9X;2y zWK1k!DOg~CXp<+CU$Z?s#lOHm{o{)Ne)V3+kRA4<xdH&TLn)Vj-UqxO19K<m-l^$h zVtQg*#vc@MMP%!Ex;}1Xd{5rd|9Q#Jx_ZdAeimPn085TgIeX1B-EhYlPaG+q$wKNf zyXLPZISQt9r+6SdQI_s@{4{tGpOCr}fDhVwW=oG$(O})m*to_cm6B)>lK*v<!JwS( zugd#{BF?dwI$C5m!fB9TIffoIX5Wh&7?IkSkUO~K`OclYLiunvb*2Hz${o)e#ryJ6 z&U!X3f(e!~T}&jSnk~zCs;{TN_U`iF&4$*g|6#Xh`m!tiwhp;)u+-Bh%f21wzv9dC ze!2QMnFu?3)(lb-RD{w#F?$aVdE3LCs9OUpMv8;KuY)HjH0&zF%+Xc(8|=p<Zni`V zERhF1fytUI2>xW5ep2*IXBC)|QA5!i1!S5S{ytQ?2|6y<2k0OSbZQZyy5LB0L8;Tq zYpq?2I?a_JeyxDnbKQ<FvE_fZeb6e{GFr0D0vAWPh6tyQJQO#D*9z2s2_Z*D5*Leq z*w{+hT~%pyI7HImV<~5apZD6vdDo@Mn8bkk04nf?;Z4BJ7{2%mx;QZtc-&Unu<D?6 zp%J}~o_@8>JR!e0quUZaMADZIcepTX@-#vT$Cj!n6d`|3Cd)tsHdrQ-tKPStOYg4< zK<i6rUwgxKLEF_9_24V>T)$R-*j(6=_q)qS8)$JrV7xKsh7IX~M0GNR`#D6<Ob;fu zI^i-O*)X05D*!i@DuC#!s0Sjaq)0xb2o*f?&l<rkCG1TukJ@r4Z8wfn!bSAN8G*pb zLE<7tALkVcJ6y1LylcLtOn*cH{J9(W8rNbDbKO{7al7@MUg=+uaNSK<|Ei)+#C-Ik z|1%7?2Cr;yo|!iD8teuK`&GmnHxb*buKFG`Qb)A0LGRw8f!E_OGa#H(B7-FMs%J7Y zKJ_CcR4r8BJiuMSB}d2#M3Z@G%@ILCDAELOo0>piRCe#BMs`=j8zeo25G~?4<lq;0 zozppHR0giEU|T8p#IUDpOx!iQ5-ji<g3@}5u@Ry4!jO@17dx5v;Yl@0FH}VUkPBi` z$a~VfCYMY_g}E14?d)FTu36sQ+uJe2GcNKyjIdaL$Y!K685@X6VbFT12|DXomLw8n zh<WCF$t&yGyNa?=saR{5r(|fmId_uiIG*?>vQ!PCs&3_GNg(!8vpV;Q+}b$rFF^s! 
zXjEqWt3lR*gvYvkf;F1uaX83?ulz{4`3(dxt_x!nXJARe3CD3T{T>irTzBDi?o}r# zy6*O+7DV8hr`E0SIu3~O!D_r$Lis&XHeH?oG}SzeQ<5Z0K-}?=EP-lLUQ+TMO8X}B zbEQbtY5s{R-k%c?O;lhzbthxNAN=bw1s7)N7!}ABmhQ6l-YWg_@}AUuAy@~&I=fq` zIcccN_BPc+jRpjPz}THa25BvSG-<oi-Ryr-xm~G8pq0Y>7pog1f+%#)y%FK|+2u8v z_?u<S>hG%;=JUliWx~^{TOj9fF>~?j>;v)T1;PuRNwM%NnZ7E|Ps<;8#psC<U#n}6 zM82B<z?jdGrl22aMEN@CX8~&IWB&Edc6rHW#4h*+!UDYYqEO8ltsj;Y5aMd$&N%+U z;%8Vyn`HTtvW2;=I<n?gEcv_;OASd8k~@xqnyg<AZL#Zel|?^<{IfIJ7P1atOQ_|0 zkB}ke38dWoVh~t(5a9pOUGGs(W4=Ht*@A=F+`=yp+hs?Mul3%&DJd8wBWmA3I)hmi zw!XSDl~fW(ov0t9*8(5#_?+K=(}d@s<uSm}gAD|Rw#^O%ffN-FoxERoRqTc?QSAEK z<KHfWj+M72Jhp^1^j{iim5VErFQV881I3!omhm&OXk^cym0RhV=_V&9$eNY#{SXZf z&5RK1A#1u5d6|!tB4Rzx`eYwA#c*8JiR0D`_K)4Lh~zL(WhsEC4+<@7TJicmOs17w zZ5}g!DP{$Lk>Zk{9oXOrfrkzRf(zsieAKc*wSh>)CrMv=kO=^Q=R6ePeN#pCs=LFI z!YT9p--&aOf5J7yz*7e;gV=W+ilGH-dyo4&!{gH!IQf-7q<%eJi>T1C*?s_kttATl z+4M&3#{@Wh;oa?bdwmy9AS=6W;1u30EKuE_?qL=380=MKP&qCz{^fEYWv!q%F~@AR z1_4@!1-G3}!qzvw<@!M*CEcuCnz5cPubV@SF+xAygYB&M?tYH}ph+YJ7DTzgOLi_k zI=OKWiLUT5!O}v4kzc^IZ&j8q4vAOxF6PyBlg63xFh0)v^{%YKjh{)x3XyIWT*1Ee z&1PTd^=al3FX!9jg!Jmp8X#7Pv+c`3&pi<DdThh9?VB-@osHDp<Ia20F=x|2W1O;5 zJMNmaCSR2!)^*!=Mk$lJ6v@BQX24P+!&r3OIn6)gB7BfRnHuKp=pfPJ(b%=R+5%B` z!W(p#9@(U_Q`G}eCn~_r!z+H#+|haAO8(ckdH&XManMmE42{U@b60Gw%FI|Pdoe79 z;kemKr51N8BlH*BFeekCG6ZLWy-YH~V$H(+vc2b|eY(D$y0+faBnZ!%fMmlHzhSA5 z?M>V&8VIbY)ArY6YJFg1)wM+|<q0Va(i9ZWo`ILPF5OV_jO|~<+k?T>rs%72yA{j) zdbfUF+lXjjG1yahWoeyL_T>2CX>2{n&5bk3y|}_vlqDotQ8XZ;HHmIXak&4pocOuN z@_Kd=2`Vu08brMIqvf0=I8EtCI`sZcV31@_<e8irjFhvM4vewqsK2|%!@@=!ji#m- ze&TBa{d$n{>qPI5-wpSL>vOSA?wwYBiDX>b>N7QBLT6;aj9$Mdn{xFacP@#2hff+2 zQit5wEy6b$B|(06jo^P?=RuqQc6|T;i5Ro7fteY7K*~=I)r$_lzg+`-@c%aCsDU@p zptq!!0nvXc{XnoBtEoLYVIx1$+yhM)LEqIBRuDdb&J1&)ofUXH!|uM3OjG{%_O&Pf elh*d%qiC^Zluu#jl7fRDfU>;0T)B+JhyMW8?6Ud* literal 0 HcmV?d00001 diff --git a/docs/assets/design/hybrid_kv_cache_manager/full_attn.png b/docs/assets/design/hybrid_kv_cache_manager/full_attn.png new file mode 100644 index 
0000000000000000000000000000000000000000..30eade5c7051cc050ffd0ccae797fd083d4389b0 GIT binary patch literal 4120 zcmbVPXH=70v-Y6JL+@PyL6N33r3S(gK|o3XDM}GS4ZRbD1QY~>NN>^!9U@>LbSYA# zcZhTk9Vr1sYDoBAzkC1OwZ0$s-u>g9z1Q0B-ZQgjo|z{~Ur&RMnw$E{l`C{wnrenu zu247tb0ta&;QP31$^Oa}hVNQxkBkDcb{B$_CeO(22Ml3qwxjR7qK1&7hUEpqDbWl4 zP9s!q2D?=1g@toe6$4w#<1s0E{7;3L%KR|54V0VW3%!fkY6RC532EII<Wqv4Nww7m zqK%KFxVO>Chir?ptnzp^G>1_6eir*dUsw&e`^xF<<pSjIN&6*(r1T%a#L(LAw^oJi zz1IMt`6hu15b+eZSpaeC&nI^Q@i+B*9YDNh<xB^}6_Nk>qb6-*ePyM%k`iL7+&DTq zT0X8Ai<RQ%|1vhl)Rb!v$BkdQ#69pu(^!@_5eNiDMa8`n>PSNqlZeCK-={+2Gc)Jf zm*jt8bUG?OiHUO0PRG*n|2j7AezeCnO0sA!Na0=RQ2>F=ArJ<~w4RmwmQAcpqv3>* z?|#0%HyeB!63W`wh09>$2&1-%g_%DVMjAk7KgN(*Q<IZ89M02!Zf*{^ebaE<UON3M zykkw}KUnb}&9`rmOGoIiRjt~;I*?aTPg1}q8jdIPP3-@aw$a_Fu&(}UE0(I#N{X-& zzu%dqgam6FaG9UX6qxV5H2uVy665efOY<pooB3r6LKGGN@>>^h?G}Bh<rsv4J$r#n z%unQwaPMq3g`f_JL}Kx<#lzrG#R!48o}k0^p}g$k;^LLBF`fEtK_`SHA;{|5T17?0 z+Ulw#E7jTAnZ(P_`X(lISZqT4DJ16=`9_IewppXMVCE3}Zdqw*gV)OSq&{3-or%7F zJ6@eq@Xnok&p-Wbnf<lRmCs-M=~G^%fLr|hQX%9Et)#oqZFUY04mLLI^N$j37m2NH zOOL_Bl>Fe_kV`Vj7-M3<#(jVp97G9)Dy4!$e7C(VdKW#0%&r|7@lbQsOS3@uj$z$w zyL)>y-);F?dk9WrLp$0ku5d}wmNXPvjFmc$2;w=kB<_q=_z3wE70q$A522gUxO!@O zA_94YxHh~A&8-TnpMBR#Ac?U({^d_lLE`m7;&X#RpgzYKtskUq&U`c)O#|NFni8XR zf<Pc*Vwml3RexQ-&dtqjVq|3f?)c;cc<H(6X*&l8!d$(_*Y{j|i!Gg_Y*8qbkC#^& z2E($erLCQ*=;!S3?{8;U_~y-*8pn}+UnbSEvNGVTlA}0cXJvi;$<9no$i+cVVc{KR z<rcv8%RkP~&oeSJH&$0~D*E=ex4-ULS^B{o7!<U-v-7#7MZ`76)ehfAI#b2%9mk^x zWUoh;!pU4hx!gB&md0ocN}3OIg-8&-*-#;khBSENM`4M5BXkd(>bv^JP|XV;#YvXO z^{2<R%N;J?vj^cPC}q#%M-aFpzMF-!NdOBqpEJYKJ3Z>0TQ8pNjEZCSsAaN}J?7=i zccc$;-v=)vZ=LP+RIei&kLRNnd^MO<rKF_ti5|7}^^hE;na<A6(b0H<cw7%C_#z-M z@Vpzv0R+J0)YQq|cL<!7-<N!G1|&*oXlQbB^766`VD#=>z3ryJTSX8EaK_5cjuNZ{ zgK>HO{Q1+TPodDIj&K@fWf3x5*{;5%L`YZ|cC<A`B9SK7Bldtmij0hObab?8MJNaf z2?+>v3=C-BVjNvSk}qC)dU`&2^2FQwXJ;e>9UUDKi4=Js{85aVk@4@dAuA18>Aj5- zk!s_{iH+NDIhhk_Qmfi>;pwwi&p&KhZ!FJ!n<uU8uLcXUil5CIzu2X*8Gs!xN5HEB zn-n*PMKMku^K4qaf`UDzPF$1UmIJ6%g$MNlEzewYj^w7JDf!7LL#`JJY;@TwWJsBh 
z#9lSC#zaRT9ri5E<HT+DM%dvMMpY1;u(Q2QLr2#=T3%71PN|zN{U?8ywPdkwrirCx z3_6H(e)J~|```zuu+65128$+NAh2j2&-V5G=uZ*T($WfxEh#JO>+6e}_Vw`@7#P4* zmHeEainoNrPZ}B;w9~{5I0Yu$+}-`Xy{mD!JF-AhH#aqjii!OoNT(0>_V%iyf-UEz zWM#=`8@a!|qul@iK-ser1OicdKh{-!Nq4Hvxb~NN=1sa5_D4um-b=mUeJ;EH3eR8Z zrOHKE_MSFRrUw{f2a`QWXxid{vB9_0#8h-Mh8&JQ=ZK(0=MaYTJ6!D5<?2I1-VlV* zg!`9>)f`KxDRs@riU{+mN=ko?LJ33A#jjtyNit-NN1ZNl!mz2p_%kyPVQ@n*{OsE@ z2sBH^?MpbI$WP(Ed5MXMo12?iSvN&QZn{j>I1DX&H_p|$9iE(Q90YKPTgZE_QbxE# zAS3PF+Fo7<IXS%G+^VW7mBCB(sFt7;ZZ58tU{c^e!#{s|15xqf#S5EfR#t7m`22Z( zem)CNNWE{KpP#R&2)jrR>J|4|x5A~Tr;FUaU4X%8!%ZzMkpN;+q8AqzKY#w*)uk2^ z5<*?Tw=MA2%-o!Zho>MvU)YL*a|SaQd%TX7-t%aXGf9nN+;a6mAAg7_EONOgKHScD z{vp2uBR3AITD<wTVtNdJv1?P*yNCDDUH{?f2o<I#5(~nzDt7B&TinoIx~>p|S)<E{ z0(6bTz#qeEEuR)JybFDWpjm-ZOaNK(u3b`jajQTk3(pQha7l7u><z`$t3yLWfHU3+ z2?+|aadT%RBtT&>n2U=G6bkk5h(o(LIHcz0N|?#T#Kmc}Ewm@O-@SX6_Uk_zbZ`D{ zx527qcFE}9zi$MNqnnYDl}$}fF8rig4bnF-7;8lmFD@=hTbLbBE1R0m69u)6jE&vg z+(h2fPb@U~VZy_^f#56~LMO!PTz{-8%o;5lj>}HCoSYwS^ql|hRBOIZi&~q`xBc3m z(=u~2uu@Imilr{HTC6o9!3%65ykkYCEA}gXvtx1m7ineK6w?A!t6#gZwhso=iY8QA z&I5zCoy(6kE?v53dA94qLsr<cQaw34qu>;|BM@!vf{U&($)AUZfJ&Vm9f5&?FFywu zn46;<?cC=Z5(d^XGBOn9<v(O(Fk@i83~^9R%?t7?*Y+wE4>^xTTy?csv;XdJHpt5# zQ(o@u>@0sXHaz^#2$r7!Zohi<N+XW9XeBew)h;&a*$&Kw-L0;kicwYPe560Na&mgl zmfn?|IpTE#^pS&4ij#SHnU;-?hp79<2p`*ITTuX)b$nfFF|bdts1U;Ai=vt_3cL*T zGk&GhTfAOuX4ZVozE&$=#`bM|>sZ#ToKa;hty5N3XmO>KHk1OZ(CrI_m}>A0#nos6 zuy%T|k|th6x~S1aAU=nOhhs39!otGXSUokht|^SEh}FyU(P^wL4K;Nvud*`?#>vUK zxv>$+XlQI))J&9em2EEjL7H;ws!~CObU5AxgM<4L1#@z83=9lxY;5Ew#>dA+MMZ&? 
zgjMT3Vc|T`#?sOkhnjl#g(hfc1W?UnWo6x`KcR8B<U|i}b*+xBWkKYhT&n2!fG#WJ zWRWshfA4aT3r8?j4}w~HWS3fdBV(|Wy(JL5yo<?|B}Ctm<)Wo}8?Udq=MpF-g;g-| z$K8!!!hVAe+Yo1Hf|V1-pPN?n7L!E$@`op*9U0U!-R|oGs!a!98|?2lt8?WVh70@q z`9-nsk9y=ft8I^Sqd)#0I=+oVSlb?4ZEa6n&k26((fgPftG3|#jFdoQ!=zeNRAgvq zXdw>A#r_orfS}u4T{ajQI9j4IE-E4-0tSP{#8A_$92^>yM@L6v3zn^jS8%wT?stS; zm2^p_#P$wsXh1%U(b>(tSL^JH_l#m7{Z5jngoKQcVBkWXr(mqZf;spge98CMwJ8be z>S5&&qt}dEFb^#S7NtJx2xbiwzVzO+MSBQta{!i_l9BpmAO|*vk~E8oLX~7uIlDNi zQMTNZ=x^ipXhd9H7y~>rq+8oqCd8zwypa8aDzf;4c$eq>8>*VZtl)tC_Q65g$J%z? z(QJ*4jc~&JBT7Fw9B5Xywzq8?u9p-Se+4Q;!+h?@1|QH-3=PE)HYL}GvoU35T#Sr> zVq06(R8-jd`W_#?C<6lvi=DN#we4*HDGZKooJT}crVRjKqU^IytGbzH6+F`tBp2D4 zE*E$FjdyHp4BO`A?QLje1h`m6I)DBAeFEvNn9zv-3=fZw^)nN1BU{VbH`p^EjfE9r zArnWqXR7G^$h?@V@F$jK`NS7~scX4bsX;gAp`bSfg0KKgKDt>(G6BC_I`<E9wRdsq zrN4!G-Bz<)3LZ1hF++|a2%Fn4r;@y+_%o{$AwXNlDpVT8r&$bo3IH6;$!QFa|C)4L zUS3}5)z7Z+aYIwn`ntN*AqT(@Hu4(v0S7jYjvMI@9GzWU4$m(9w(&q#kMZza5Bs92 zqhrzNZ4W?(wl*pv;)&Wde6znbI}6Li+}!rYhDH{%5Gy-7fDLK*N-WmP$SAqFnfN8_ zuN&Ol&+nN$c@puSc6U!cZ|WU{dZ_xMW^AWB<udq=(1^)|bl%E`2y|W{x7UG4#54bE z^~jPSVlWpjNu=Z*|L!M|(XZ|UH^2}RJ!t+hD{c29*KGeI<bF^IgElnul0mK<({G-! 
z^$;f%*ZK3-?YG4vq^*E0l<4A$l&!rN7^q}FQ&Lh+za55#g~4~G4^9tk*-<iXvkzTe z%n}k3fG%V8@XSJ-<FXu`kzr<UUz9ORBuZqYryrbM<mU@GIXSVguw?K?GBi{2-%SE4 zV&9VIrZD{s9VO*-OVGZu;->s-hLE<(%1U~AdO<<K>gwu{02zsJ%LEx0{5|u!_Z|3< z>R16~6TBz4y@xe769D^K>MZ7ZkDMke_5Y^CvZi<~j{C=ZdHXK-`EWX$T|W(>PfAY2 zXG`DkF;Z1k)z+r*NzTj5yA@~((gW~0Bjc2OK>u*HK-ppNgM!ccKYP(A)C5UGr4$6E zh^<XwXubv*7(zx|SW+bQ-NEmaQRM!cS^i(R<bSg;E`rH*-6I*>uQn}>2G5XmzA7sO zSdqqw7ho=Lzyn{}at!9b7#&bhNDD8Q^nt+9SR>y8h+TV0XHSpgKc1|_=^hOAHHzgV zYk=myzSIR?q${$E39R^U+AGOuLB0xX|9@!rzoE#Ne|e7UKlyl4Cia_aYN_j~RjOKt F{tJ^y_ig|H literal 0 HcmV?d00001 diff --git a/docs/assets/design/hybrid_kv_cache_manager/memory_layout.png b/docs/assets/design/hybrid_kv_cache_manager/memory_layout.png new file mode 100644 index 0000000000000000000000000000000000000000..bcffc27a716497b2b280be176b4f40e520dadda0 GIT binary patch literal 63113 zcmeFZWmr^g6gCPLC5Rv(A|eJT9Rkt{h)9aWPy<SL4xK6~4bmmuAPqx|2uSA;5<>|L zIh1sL&!E0vob%(H>pDNr{Ll^T{lt3Kx>wzM0$$6%BqqE?h=YSe{7OpV4GzwgARL^F zdH5H=Zypm)C*t7jzJ4X~T-hmc^~CeDmP*pu8HVt^mVFTJbzkc4W`!R!Bc%I3U+KXW zUO6VL=fU-tpa<IfZ6Ay&amo9phtxBe4&26Jj;p=ic^kA)vdj0pX@XhiY}xJXXY$}g z0>t)kJ%Psz-q&!U3wjJ+NJN9X7V*WFPfKR1%X&ivc76?tWFH==u|OP$a+Qu&?eb=w zcU;$#@`g;Y3j`l8emhydKbJVMYxqllFPr}VE1ds}44NS&YO<Mca+UK~UCS&qF&qiq zl?HxEiY%06qvlLer558b&9NqaA_C!i&I0k(pU7M;^ETILX--91jN3w5bW3*X11dc} z!`GEYIbWUazcejcx^jL7J(X)qOG|DdbmO!Rl3)(4gcVzoJT11E&UuYOL->o}$CQEd zw(9tbY%vz&l`PC%`R3zy=;&hDSEUtizZFP|7HuP9i0S01&*ufhZ~YxUSXux(I;ZZq z15=cAHqxRl-Ey5o@Z00N(ytSm5@KUtd&<Fhx_|$Mp)3-aziN^Ts4ETcUL0h*x>ibn zh;(rO3_k)xp?+_MUZAH1BY8RHDua<8Mz)vR`1y#XLS0vaQd3iR9mL$O5)<bld<!H3 zLmL_zBy}R1nK|kj8bq9HDjloFd{UvjGZmZZgDoi?A+HWwMNJ=@65)aYo5Ogs3U&mk z?ET_ovy_=Cs+&l@T8vX~GJ5AIGofr061oiyhE$G2rFbh|Kl;71!!Kvgprynv!{0s8 z*B3UGBa6;?VO#Xg*Eq&(B}LuHuh0=fBPxwL%w3JW<bEs|%73oT&5GnZ(x`0B^|z4` z3jE04cBhf{8-zY$A}Nkcd4q#lT5S0?$)?7}Ihrr)B%4bM9EK>ci!H=1wxe|l-?tWP zEB@(~Qv{KRsfuGfk@DH|H#D|v_^lQ4^T}oy@-E`=iJi+DM@TI?Q|{Tq94usW(`IKx zmI2vuF9XMr`|qFYfb=Xz@v<$8ii>;Wpv;l&`do3+ncrWF;K=F8{~5wfJ=#D$x*)X7 
z!qvq^V*$1V6Nm)sy)2E!DZd;A153fd@#Yd17UtxfITm>Gq*==kXUyl%S>w2kR<bfC z+YK8QDl;v?G){}l)Kgz?XpT_eeE$+b0-k*K{YGHRU~TEBd5KyOCK&wBn=mvfuPJc% z>Uib+<RH|bY?Q$vvw~y3xFyNY*>T7JQ%l<F?eR*bT;*>>23BqI8xwb@zkKkqDm1r7 zv2_@hEy3VGf&@vC%|oU<MxAaVDe~DTA&-K*enr$0i#Y7zJx})cb$NUbBlB>sWvT9K z>RN(p@uI`L*2Xdc@pj96-D}sW5O=&QQhlw6rBl8y=O$4y*I);f&7+FfM~vRdYT1lC zP8JoMWFS0Wr?)(SyVWS2Nw)PY5lS(cSwwwn;^vN=@RZ^T`RG^}v040fmOR?{<d<Vv zF6;Mc$JMdx2L#T<PuCX@m=GZ<Iyn}P>0+$V4Gc}I-Rs{f7AOcWTUcQjF4v`JRB{}@ z7{64=oY*RSRA;GI9{LIujpu1t!8^EC;_onzAw)1g7wqq;-FL?uK9M&|2TGQJ@l^VQ zmCc_WI;q0?d}>gHbOX6@8sFq4s;9rL(~2*^-c9z5s$^UbYT|#iH=c`^DLfy?c6ln( zuQ&OE!^4CO;rvNBUR})8l7se2jXJ7;<4XlDRgg~p1axhLKry=t6V`+3wb#V#MTAj( z6cpY&p2YvDd83w4USr6vutjaG)=6Z3@tFXf_3Mg#u%boJU;nKrba}!b9@zhVBFp#c z(qL&I{D`0-vVf-A8=*t&#LnowEGve7(q5cjfzwGJ_+4!B4ql3M1$Btm4{sLD?bT<a zZ!NB2VxIa@TAK_JFAWAxSTWK&W)du>@!EPmY0k||dRMAA8cDCDyyh<MVR&(uvZY9j z@>`yQ^PPss@;nH%tpE}HfxUm<4J)Uz|5in_p#F0<WchfPk|9H>ioiJ5Az!`f#Dm5- z_}XLu8f9pWPjMOj^P*VfbUF$DB<oW<a{--g^Sm3Sq^~SI#T$(`-s2e&HnLt*vKwrF z-q*wF;?UrN`JhR$b+gI*7oW?MWvI`T!ZW?*T0L{&J<3yQW9SEw)51N(b(Y&$*tv@f ztUp(IAiAfVC107zgwMRNYl59>^RpGzS9P`+CC{BiuL)-kCrF+oI-}RHY~GeXvi-Ak zrmyx-N+c5Lq2MDrhlAUhF;=<8^6TzYkKHdf5q9sJw{8(SQSHpOqql>TKX}O-`vcTZ z{rRbs=><o{e2@LgD@;vsad8DNZ_{?i?K^iIJfhJ#6g_C+$o5MxXl#?!ucE|I&;65X zMD+xH1A20>d(8g%FvWJLr{>`l{vQA#_%g4{#ESoVed^X57RscN;reS3D+$z0h{d@4 zA?a6Uo_AMMdZY}=1+L#TxxA+x8<f6~mkxXC9R0anU#OTE%_cnEaee2Wkb;FmZM@WC zb8d|LHy-xhaGen9#|UO_H<=IUGV<&Np2qA_OhTm%WEQo-`LxJ%F({`!D=WaP+Me@l zzpDOB&^PQHOJ+^@^n!=QUy+lO_w?GYPnB!R6&6XH@Hjy--@FR3l~xsF%tV^>7KQxA zK=N;G6R{_j#Svzx*!nz{C}ruSmZC|-@$J_VW6Y(FHsa5A+Vc<k+;8vY!ddRbHdf7y z77y=5e`FWkQIR{dZ~Xkz<Cq+EW+(S2TG5E=oF6PVaB+bL(6snk4wRoL$javM?V3)G zXjfW<hKAC}E|ym_CdnId#m&sTD-^$A`c5NNRq)fdx1FN}sf~{GO$AzPBQ8RW-j65) zx8N4{Qpv@D7vuNKg}x`LIdh^Ut|$s_l%{c<m#B}E{UH^n2_xIwRDhk44`6l3%ZmPu zmn_4t2~JbT{p%D4vI+`SB7TrSecHIOAln)iaq+!+7uVd6{x7DR-<iHw&>Sd#)MneJ zy8FctId5B|#$t-ceAHNf*Tm7d=q>Lul;B}H-K|@n*5*eNCETzZz3O-wOw&iaEUz=< 
zkW}2?hY21}VDSba!Nj}?(8whD>^Hgo8kyZM9v){4*&#SS?UqYP()1KJGK*e%t>9hj zp|q=372RK96YlLltm$8rHv9$PRpiez?{gm#b~Q@+Mfbgv1@C1qEjBbHB}zU!U5!yO zu;fF2lR)JU10+{mkPALoYmK}Tkxw~X$f(`laNM7vMc<)pvHAE{%Hk<w#znA39H{xr z0Tc2}KIr=)fzDFXig+~2wnod+N=|M+ckzM!Lcz}~4$%g=;bzFp@MZZyw~X4DAb)Oj z&;tQhLP~U~XWCK9AAPD?a^twp@w2j?KHNavBKMoAp6fAYXq@7%Z%SZC<fi)Z1z?54 zGWUljROKFS9k`{tM;UKk9ZrZ{<$6AQwyB`?_iz@km<shvaHP(=1T&b89m7nw$ZWMA zj8}^Pu&~8ApdlY4k^FP@aNg2VD6yF(lYmN_A@GVldl?Q-<s@h{UL?ytoI5kMk6EnS zK&HjXjcZ;9tDHUjyGn?OB^ev*2$7OU=H#h~8Gc?|l9odmX7WdCYg4gAc;~#VoIHeu z@`R*SSV?6?#d|EtuFXffqAVAMK8lNt<>2D2+!>JC=?IU>tn;d^4JvARE1U)m$PziN zv|!XVH@qiu8i4WqWAFjllFD+S_Co{p8znKW1^L6i@bHwn7v?$h8ZTb8-OgjZK=P6% zczkyG6PP9SHNI4m`MB!(kBTyNz60Q!LC{E~DcA(PHM)3H*x~@+`70Em?=7fiC6H(N ztH_(Vw78b924?c62XZX>zf6k0N@i{V_IN*EaHFs0J?t9Opyb_8-@>FO@A3BA78Yeh zHdS1(bQxEve^}dczykn`(O6?N#DUiUHa}v{&t2bG3oCq58W1KGc+kA6#w8|bP}ucx z_y?2#VvUUsK0rXM;Yv;5pFn;V>S_D$_)BvyQ_0DyEf^aY@u8!8*|m@ng4ARte4k1W z#U}%KIo)F$Upx?#yg7rnG3*{FdDNN|8Qp2%TwIhqtXmQbVnv&<(c0gXRN;NM=(HRL zdCJpC_VEYTx_B|tPSBzQH|?V<)*`2d7iSro&wiy$pB7^c`58;DBO5SqI~tzMJY^<a zSEZ`o<C=x4$OzhAcY#{MpP{ah<ytcZ5GEj!1}$V`uT+KG@oDY_dGi~1j6bn?ot$vF z0G=NJpEAwvDOh)&)-ciE7@E(zV~$7CDre^TEt=j{r0aSO`f;Z|*VqqC=QGRuFG`qu z>kkOFALH+T%5jBODE@U%e=Jj1x>f~yZ&IF%xZ0kie0~)ES;JWJ%4F7+)efa^YmRG! 
zPn+yy_Uer@aD$`qFU{$CbmzF<i*L)DOYkM&6t-s=Ahf#-FFx2ff*cjf$yv(ohpM&p zK=_FEWA|G)O>bdpf{!kM^S`j0RsXf6iH~gHfY!D^<dk(6asBTSy31e>^@9VN$|M=y zy6!DzGfWOzLGrx3)86D(hYHUH8>hbZf8?~a8Yh<e`q7QV;Vb%C_6pvvAH{_21<RI{ z`q)alnD)Ot?9CTGZeh^5<8j2hRmUOrBdgGy;c_#lJooqV!us001Gi5@Bfg$0Z9yUf zgO6?K&K3@*?*8+1lb7=NtlnuaI)doK484~{{Ez{+mT*!5#VfcMU3eR6H}_tdiNX<m zi4KDL#5+2u%HT)XBV8`c=gve!(J<PW;6rd@j#JX@*`(F2XS;T7-OH+?mw$_IWOtoN zZ9K0qgJ9|dzTFhw-)N~t^UWEqMfHVxil11i=26ZL{!NbkN%GHQf1oD5*Uw@9|C_64 zj!cw+xw{~%%escY*671YV;vpoe~y7*0uC0H{2|#Yta5@+>`=}hl*PfhdjJ2?6?CyV zfV?$?mK#NQmqqp5jR1^Z>`N7AluslTW6>x#?oWH+t9fxyTng`e4tQSxIU@<Jz}&I4 zvR`b18T{|u;ermUKTGslDOokj-!^`@Ohg%LN%H02DxhuL0g+{c?GXZmE2PYS{1pxk zsy!~c=bGNz>vVrq3FnJoJAIKo-~BxC+oXSF=!MBBq`e-LLt^+>s9r@p{QRSRyRr~G zCAFu2gbynO(_K=K$qsMQQ%HPT^0x6pngERKJQ2mg`Sw^o@u|&hbAVoJ2<H__+9qqr z`A+&OZpDM?6=?<DhWtscL6$iNw!py1*u-*!=t=NjDW4BtZYH`mMll`mCr`wA#%Bgw z!{_s@O}qF{=BR=e!lz^cuq%HPuP+G|WCKu#QG+1c7e@cKV>ztVQ3>SpB9Jn?FIDd* z23roo!GUX~tB(Vr<)n)HDZct6=?x=UF}(Z#Mu)PwElP^v{Y@Zna3;Zy2#|8$SYG@) z7%kN3eB!G`>}J6M%9j&7rT8m>MGstlRN9L`eKAHncruU$IH)93stuCAsb=5Fv6z58 znSi<iqxVyX)tmgZ0zv=wF4`N0DeYuKMh93gFNM$w{y2s$KKSps^01yb@1FWhCC!(d zv;qNtAL=5Me*n^(SAmqQy4ML{H~%gp_Qf36b$h<Z@vr`u1Q`Ao#l33_v<)oFo0kB( zv0EA;57oNhkN@{6FFvEVk?EO|XDk1NHQk+?DhOL74XnTS=C9)5G|b&2`_Q6&o%8R~ zyzbf-F#oSN*tb0|X4(C;`1c@i27#7*=wz#v#kX1g$;pr{e&b&mYgzN_TiigYy8{2l zPJJ3!&$UQl@$#>vNu#w^qh37dbNF`yaabg>#Q)M*;NSg!;q19bHU~4N`nQ-LPXndC z{$w8W`FF!Ra06IP<=wwlYV4~eA%?x<{l7b2#Xue6z&wd0toQ!il+4Xf%TKHE{O>yY z>VVo~*Tzp9FZu6xWJV<r%J65%f4k9`XW8xe0(s*1Z(!m%-D0e`VT)l)m3N*0-8UvK zFW;(LZ2YHu1AN^$SaTRMmHyu*Eq<}wYXmp^JC|pI{O6nc{?EpZxoiBr-<CkU|67a4 ze|vcMPwW!k;}Sy>N$fL0=(ogsuKPF=^NxrkRDN;8;SU<Cx=rv;!@&afYIoc!!BW{T z(P7MFd!bwKR}l9?cLJZ)WL;UWi0kR`K^DzrtG~<pLR5cP#5d{G1k=>jpR98>O0pjw zHA*>Yyynpd)3hf1r?DznMLXW#9MN{|hKcL06_oX8R9mNNp)S}w`)5ea>z-#vzgj~P z`?G;rG!Zw^$*!31K15S{4SR8x_<sfV4C1si1W9q<9j%&l>4b@s8Db`CsI;sb!K@Z= z`ZC(T>MK=8rR_9Bh1mB&gR;o-2^bqsWaE8}9Xi?La49>*<1mrmPGCYvNj_QVAY1fg 
zNx}0FscBWeQ#*BnnL0a}I{EMgaWalL+3*vcm|1xK&)g%{z?dyW>~+%~$Mx&(>pI6R zI*G2Bnk1_?=JjCVXPb0qn|`9ljVFUTXFEEct9o=RZjpEYq3EUUf>f+PMSiDV1ENxw zjKASoT)#Wk$&NT)NU$;uRZ=h_{kMfYh_>^*=V|X*uJHc*Yi`p`bf+7u)X1?!jl$jI z?cOttBLaEn><Dq@x{%HNVTI!SSRV{v#Rz04@4B&C)~j7@O-}%Y<|>=ceD}c!VQSFH zM?e@>8L!H2$FQ6~A(a7yZe~`K41|hKa!k(MebDcWPRgE#0dWj({%V!0Fyj5EZS8gd z(W!LY6y4`8((U6AK*@Uq*Xs_p(fOfuVEaJBIR+C)1F>)|cZl{pMVFLt9&7`F%(Mhj zC0HmNw^Ovn`tv=2=hMR=;)_Ka9*4-34!RzJl>)?3ErNSIcHd4ldTu#y<p7;>HdO5K z(EI61L7B*INoeMq$4J&sf(_L|J3Bi`c0K$(#a4)uT^aWJ-&>vRXJE2y(plkk=UzO~ z{pWUtGC?Cco=|)&-hg}2<w$wEJDy9KF<IH!-4)CscRY`6&}h$t7*CU<Qv&RA<GGDU z&isK$zWL90#+>~UJxxE9s?azY@jQVjtu0;f+~)HHwS{z%t*NskY_l-%8Bts7Nh z|NJV!c7TGL*vU>6;)Im|8hE+`Gyq%|9{LZr```L^s<B}3d_=?XR%e#RfLyudpa8Yv zKk2@*ZGRZ!wkS?#>mlsCH5Y#W<<-h;v)lZ(b93#HJ;ii~eSW8>>t~a1T@7g#06aZe z>f-K&MZ3id?v<Zyd%6u7+u`6??DmO@s*diV)l1Go&q~_g>h&2kY=%x9Hm#rC_0)Eo zNu$Uw;uyd4peF*cR<oEM^AvG<8hqw7zeD9idS){#>aqJ<5xogOdX@cbi=DWfy}jq@ zuOMJL0*Hp+12Sj3R%eD^g*O!+-S9YB(Gf&QhG%ydA_#DBcCe74({OMLS%`yEj_eJ+ z0}+S!dd>)Z41OL{vz>x*5Q8HQ6A;G;cg&e-rLa^Yx5pmtW{T(OPL~}zrQW!KbhbsO zzQ7kocl5KB4ZfJx!^hqapkouLyC6avNat(~al-d;Eg%%w+)Lu6v~Pc-im(T{RSI(4 z3mlSPEe5#+=$iqMy~}%8_Mb6D47MW&b<l~?Ttg8UI{*RT3=S3&vNX7eI43A@yh!2- zlzm7qF5N*Azq;~QJJ}@k4VS%+Q>#4VMx(zR-A?Ae=dCu-+1Bm26b18*@23EcnZaAF z+&KJLg=i@!W90#xJ`Rq6{%U2?N9K`Gi*XY;eLF|B?a6e<4wvk6C=B8h8I`xnh9rJY z6(1+EU!!8SUVS2u{c)_^a3zaJ(?A)IULwyydP(b(dAd4?5Xb9X_b{m1q?Zd_{F!`H z9!F~-e5SjzV&^4T5O3;LGvnaUmhOeu%JI>^GA*Wig7SWlTKk<wYOfFK-{u?4XZ6#m zY)SQt;K|%5o-7|O$^=A~_wV1&(tPju`ftdCbMv3?_<wix%<pQER!TeqW<Cx!@yd7b zI5YP^hLKt2T!zeb?b0d^F-yK*h}M33pbyYmTVTU-e3zh$9Yn=7Ti?w{dT(>0NMQ*e zLJU6t0G#*!=bLeY{zSQ5l=l;cgUz{6{&PQ%+E8Nt?_r@!z7H;<C?=o5Upi3rIneM< zPK>_{XSioGj$sw^e^^fi#({T($ZNPjT@Oy-gom{~S~0Hy(*F)N8=UD9!xzEpz4QVg zUgAPss{1%o^v5xJz5C~Uu~&;%T68%Ry0ZZK5dR6fjjz}Z&^Mm<ci3Y(vuOTA!+jCz zR09-&Eucqa*eT1P#1xgN18RQRNP8U+a}DZ<4?9p0`e?CHeSMB{9GT>Ef&efP!!M)* zMI2IHV`C|^S0!pU#4hm_Z+~i;ZElt*n624}QI)bUCWHkY4?acnMYdOqKE^JQ)JRE5 
zDOBVzXQ~Wf+zD`!!YOYeiCLMRyLQ88ifXiqRP%>SE2J}LYo4dR4x=%SNitC1vY|H2 z1#@jPtvx=F1SMnk0&J@&6N74S+Y?Kjo{X9h-{^dJ-bN5e>OpUtPTU0*JlmSYUaSk{ zO{lQYY{;l@RH_AS9%5o*AEAw&SWqiLU6M#sW9d)`CQSo*CgEjz@8gWNPy*7)pXXMf zpqdAq`Bh*KNa##IbVg1!ue4ScD1mY7uApd@=~~E|V>~#nKA`=4g`+M`3;s*KcN-6n zj$TSiQnG8C!|6A8R>TDSg6hlhng-d3uU#`^e;g%^TU?5*3_JgLmHDO}lqyTbD*0nN zLGy+&8)W7c8mwJ+5}kUS-(|kBDV2J4%4GKq>_k3$D5g2sK)pt*r?fzA9u_fH&qR%R zdR(~~2zw{Nn-H~`D5O)PC4X`=3%O*@YX%RNig;Bkpq?vYNuKV_rJhU1@AJ};#~ULC zGt44cO%Ztur&tv8GYSdoOJ!9p*0z8XZ1A{>3%nwd_#UJasrMq2<ArsEf%=$hMRJR= zO)8x_-dvqY=9}=DFX$g!UVXC<ycuOYoA8QnvHJO_kD9BJ<Q6G2jq%&dE7fR4i+_L5 zait)+zk8+VtBWk(18PaWVj5JP<y|>PA2CNB31C~QUx;>gS?djcQCZdO7-E40ZrP+V zVo`1SYW+nP%?Hb8oKW*_IRBEy+|{!#eXgdaCSn6lQ2M29L;ZF$-SPpwo32{H#~+RL z7oFH*R#sN*(&_^O2(qRKb6G!b&u5BNC1iRy2#K2YFbbsf?(OXGBz5VkQq$iszJwYg zg)`TI(vu0py@7E=0oI}b&OrAv^8wC0n=^EM2o+BpqKPkx9$?CdfAq~};oI`cN`9@4 z9YNBK-tY_?RkoP%pL#ctaR%d++oJ?e3wum%(b26-l%3llDln`FcT1!$DpR%`H&2d* z=Pus3@8fi4%0SZ+?Y5E<rae+9tP6A*ml(QCm(7E4Q+Uu%I2X^upuT%z@UQwR8i9et zW+f0%>4e!!@&Bx}q)w`2G6T<A78Fc`@sc=QReBuYpOextov1+@db`&??A*d%J(r@A z)+7^nl^<qm{5er6Pl@m9fWx@-?0qU%i^KUcKi86J)wUP!$E(G?v)M?LsxlPiHXmi3 zS6V^(T0}BlIUd_bKo2xdRhH_sqEtB>(`>X1(@68-5(vn_FOx&^*vMUdd!BGUyO%+9 z5Aw-aPmIx|f}cL$Dv{nPi6uSq!^>lUCZPA%ZK(7~x8$d>N;tMh-6KC|Nke<RfJfHi zjd<;V_GI&&dk<@6EVOwOh>_?Xq6$65JxXvJQH4h|Es!fb(Z<?!0xj%zQ7E*HKhd0* z##Xdva!_Z*!!1^rl$d(F($0GRuw8lxCLZz5dfo-jECf#Ve?xAxY9XrRe{)mU&VjqU z5d?>OtE1ezMZ&JTw?#ebQiWyZ<km*ZUZMgfdItxUB`hrc3mgV8Tk#F4mYb6k?wzA9 zW28wnSu~_NV;V&k6Lz4NBV)gHP$y6pMF*<a)7|A<@X8HAxADr>*4FuP11{=OUT5Q; zGgYQ24tuvAv2q`==UI0^H)AV&LF|pafo6NI_|>un--)M@?K$c0NoAB<U+$&Z*E&_* zDLe1ZyNRriKznRz9CL~yr&mpNb?;S=5B2)n8SpCmX7RqlJQrR+aBFz|=#ra&P^gq( zA~>^d!zLZkWwjAWPLA95a$`g7()Qo-NHSbys~+O&Pd+JahY6B>z?kBjFr;yF+dsoE zc|HM9YLS;Kxpr43YrOwOR#w&>_uq=W*#N)lTy{(kwv&ZjL+%JK*NlFmU<S}egUWXG zdl`C1*yVT5VOBSfCI;RAlCJ*raD|FZOU)6Y_}U7hvUNuZKfGPv*UwLU@^$U{N@m#J zf=BT64msHP#S}`#S7=taxBP%9a+#8tpXm7$MM^5g#JSJZg!BN_bCo*)a%25K?nPVV 
z63qH3>q_CAQMzjW{G<x-Oxh9rGW)&<?h!4y`3^3i49$fGgcB@rf~o!f8wGXL$4?>1 zHPC*rNR6K~Ma|!ccm842qT64r@BzYNFLHl*G=}O~?2GsF&M{Ed&=&sn<vZXIj|TDR z!jG)u)lysf7h4&9JwH7!xT+?co-hJ^7vAT?1$Ra*F4$j+r|*XnhocOz*jeu`V>P<p zqG|asRs0BAHIui(t5uP)cMnOtbm!#c1h_YMR5w6YTEnuiqpSYMK#I4t@EBLt^~I{3 zHfO^fYo<0Ejxc8BPl{OwKR?5#x5}@eS+=dYz2K`+TO3n6VA=Izy1u4v$o^j7SZ%rR zlh3>`9Qq``9CCq~Jh^<qsHg%A=<wU$zgS4}NLFj$HY1@;RYxfc4>#c>IYNx-PIy?# zZ6iRXXgsl>HvhF}i#WiW=OX!GC%{96JQRxQ%pJ`cgGL*Gl%(e9d0m1OCE;bXpv*YS zbO-Ek)LqWhox=LA7=AAz?b6Hg#+MJ2MAvdR)HTz$J4|YvnSGpMi6wc9Y3L;h<gjS| z;L3Sa<A)KGa5;*x_~~S4b;fm$>KGZwIj~lxjk9P~Tep0oSQ}$;`j!4Nw8PsH{FK}t zPdW>9h!}qaQ@bugy<x)RNJPaJBrcD>lR7`4su|5ABvc@xvgfN&=w*lLLmgcg@p-!Z zg?Vjv)V=mrequ<N{K$KtPtl?HY*~>pF-Nwrm9D`ZQ!`=o+?&Wiug%b&R?vl}f$HnI zxoIUoH44fQ)Y%SH{M4drU6Q}+g#L9C0Uj?^tf*z_v}LDaM(hARWQ(zL$gx;}d|c4N z@yJYSOrbRzYxYu+D>pQzBSrg{b8E82$Ye&ibp$Lg7geO`wlM+d=WunKyHB_DQv_=u zCp#mZ*GoOrC%1b=*AK=3(BvHTUwGQvSm;sv$)A_MfmbEQp_b{w%Ni5CU&}ogi%{hP z_pb}nEYvjZPh24zNV=EMEkGsBEI#+WkF3hen)Nv1`aRwe)z~b75*bv$?(KIy{zgI> zyOlb|kLa$Q9_NR@7D-vx=S)IEaozJ@7-A4tj6FGXhD?)Pi6Tvg-G|r9^N<WzdVl0# z=)061ZAm215bB`=+WT0a1seeNJ@}+B)$KLS1J^0{y&?B@cF%*(jU%gY7H5-L6$~C~ zdo{l*_?;!Dymk=&Lb2y)7?pJqe@Q+SN(g8q@U)kmrR7|l2($0hkeiA!L9ff!yaILy z-{f+1^6XPs`>3;sos!<Bq1Nh9a)4|Y^u4}C@j90Jl;dKAGy<7N@|k3^#({0w<L1TX zt$1>ju?==rWgW9EffXtYLe30k$rm9jA%;5%$83a{y!(46UyiQ#zPVbs(e19Z(C~QJ z@**mjfP0pMft%|3K+<<%7Pf)Lc8wdeY~OE4$I}>I{u$CfZ@ZNq(XJnF7JG<`Pok>N zsXVyzy2p=~Lh6WY>1XTDnyaAgXH7H&vCk?Pu;Rt--OjY6W*{^-uCpWW%)_qd8JWG) z6EmD%E_m49I>xmfzUK}Cvi7C9pgH@ghYa^gnqonHdsA0k74&$1h;}BjeL7gOt(j<U z_TJV`9tjS5|2?MPfGe&?%M+ItG}%|K3?;=0vI}J+Z{?}2sxi%Fya(N)w>&B0f*UY_ zwVE&oFpl<5;{yICPcpQ7D=oMkutCDNf>P_5@1QJU7G?8pd27}RaCz!mh`8BQ@Ii<0 zLY@ym-U{Uh&o7b_ED3?GX2?8D<I0Q&l{!4ipNu}PyS*t_P@Y5eRpqV8);!(*8U1*1 zad8{t)WjX7LS~kjPTNu9-XRUzcUjR&H6!xD0vSqr7!F7-r69v>I^jnK=AqWLWeU}6 zM0LKLl@{5gA25Ug7*qdMHcF=#<K42fv2h}gynlY(G(=iFuR3U~q9YU2DqT%G&u)mW zs@1#f3=R&ahl((HtJOJeGKwC&+8NEweJs94mR3QQ2x7P$k}B3Ap=bjYCJH0>pJJ(B 
z8hk9qZStw#a83+qwo=XyU7%vU5%$=YA<{%3c2Sm66-$yU;5N}HR0YxSgj9=urIhIA z0k5K25beKiwuAkBd%Hz0y{fg_sLE}dT2pL-3%5WoRwiX>hTl9-aG6j({#~+kb$UmR zy9#t1Ai+zeTXkB*6THKPB8(lJniA#0eoBF3G{XIqo0|(tMAVaU1JbKs>MNlHzGa`@ z7OL94p#lv^-PDpi>H*S$gJm`O?1!|b&e=ir7Z~;(S2on7oTCIcwQ`j|i6=qQb-Bd# zy_`G(=4|;9n`hx{aI#!V(dQYT@Ohs_Dc!#^0E1jyEb;{jv6R)+z~rex%pNN@c3lta z2m5Ajm<m<twh$t_19*6nLGECqO&=D<CS_zRz&d0rotdbS;cIz>qE>Hvj*@#_yrnwJ z|EtApnY6{}ibIe6&8l*F`Cp6FLm*X}j)3ngU#?6O47otdUFwXQe2FpihDy28)wr(T zcZ|&NmT>gp3&UVNA3gAVC!Gyv8*D$|_Xw-jCT&6LItL>yTCQ0sMl=X6WN#17vVK(+ zW-JSQ+O>`G3o8ognS+gyMRTb}2G;X(Hl_|w`-@#N8WVAjsw&M_o7WANj~{mK&n%K7 z@?RDjVX@S}Gjv0iF8GqoU6Ol=ZFk(Y3852hFzUTyZ@scJqO*S0-6u<pCGVrAW-pt7 zS6ER437EiZ7-$ZVeAJ)>Q|emb&KPr?aDHeGk=YZ+sC?)xE=d^Q{d@Fwak?eO8CN*m zmd9prv$^t+AM$K#3U(u+U-E5I7M627s0g<w*WY=h#)Hj`pGkm;JpdZ!n$`VykZNY| z+c<WF*Kul1wrjpl@t9lygxhZh`9Y4V;WYEPpC??}*p<A{hX+^%(gZPGyrht>p|gDB z+7%i)y3iaZm6bV|0PTVb2m)#Iam~|e6BGFev9Sp;CSZ9`d!Xkn>Y^edL{K$5i<j#u z+0bdl+dpM!#V~@T!#040zry+gDtm&~oslEW?o}F)2Ru(k5YN3~?_MM!O=9ek2C#@| zQnMjhVrn|Nk<#y#SE~j|3Nd$;sQF(&ax>hi28GMExTi`~xXSrD(A$lS%98ovSsJ+6 zvPDj7$9W4{9^4$h{UrhwAlbJid$yxPgy4As<bd<#g@zcTH!$6b@aXCa+iXgPX)~C7 zo^DAFs+9+FrhkbZdhuTO(xBG2{tknT7gqppFCbC=4)ocLAy}cR_`C;~ZHGQrzk5-6 zF@>|?iHoeqf}jv?+%R+rHpb2h@^e*P$t3zKT8g+v(#*Gr8(5NVund%~XPqit50lQP z*L&0O8=K!e=mg=YEFZl;x-jfWvbXH6e67h&=h62V!qH0Fvw($*qysz*1Bvp1VVW#m z{dsW`Zwr1gFnTe^bYk9(kG1|d87;FAjFLWB8$K8n_67-xDQmffX3x&zWJCQ#>N}9R z3TIUU{*1u=aY0o{#+qxQDZOUG3owWgS94gPm@!qQ!;8!7%?B>TcDv(G57mgfPdZKp zaa%t3(`SHKyYvkY>QnQV+FKyU1(~)4$h6zeuoZ*Fjjr$}?w7JvX+%D^IS)H^PPvw& zFTF;mZo@!pp{KlA!LBML*+)9Lmhh%*mYg=-LJ>#W5YPCBFwBj2rri^%Dj>Dbd{bCV z|HVh7kZPgYPQbC?S9^QQcZqAJ!O2};^@kGR!4r2Ay7K~pjM$?8);+on%RpG1B;OgE zRX?Ab%-EUqjgTJrIDsVjdghegPOYr%)Sx4C!pznO*|l7N1bLNHKZ9c9gd*&<pz!J_ z+0L^I4zohqxyk}M*$Zw1V6TaR{Ld?p9G3rgbwbk!-E~3b<AuG_e9KqeI?Kl~P<xrp zgemTyJ+an~RL8Ra)lUo$r$J^`|7%weTT?Se#up|Gn?KB6xypKDyv#@e6<e@9_!bku zm~6aQVIg{ZPcG%i1L<zIR1@)0hcn5@+3&VJ#IdX3a5%hYauPmzXMo4iQV1eroFm^I 
z8cAK1e^r`6a12MB5kJyEm!#&FS5VsA0Aq6hM?oz+U6#0afNinX#K1>dbS>40je#aN zIUW>|sQ4=!m#!dojpI9m%HziwG%#_<n#(Mj$X)jP2%E+psvM)si4J!$i9L_!ZtS>d z3a3?`2{)T$xRzHV>%0QVpV5pSkr+Tw5j_*JQ(J{wdPjrV9ieNTRtKsML#%7=t+RZH zY{~ayp1Td=J<+%c6~G9;Jcz$!ShIY}xIS94a%{&Ieuml@O~8znbi&fZ9`q`wxGql8 zL7iuKuAz?7Rkhj9j>f1ed8$&X)k}1Jj-E4Kh$Kit^LiwEo;vS+#Avb18G0l;%@T~Z zfmqXRD>CKxL{=)HVDE1A0ZJLp>^-Q1>W`hOFPyR{t`(40rSi#rZKJC5*-qSfftB-d zu?(QtA}u)5)mY?ZWvkhX#Zvo5MyN;$0eflsdXN1csjRQM1v~AQpmvok1GkWn)*cF# zpJ`k?B*1i+pAa+PRb-9mE8fHm^Olc?vH!l0z!ze#WZ0rb)W7mLSv_Ftz0!twacLb{ zhMrGhPwXtf_{>>G$Vv@We7v|j?JS{KkiVuS)Wa#zYMIf@uG%BC-9^qQ-<E;~eY{Y1 zN8@_EE?ywJrLN$f;y49Xc40Ta-7lnRUw1ldirpIOt&*VjfzbrgQCq^pTb;)mIXo3D zy!&ffpJsD#TYWVHrzQ`+Y_IfcyNtcnJC}L!$?sbTBR<Q6<zSj&hf?7}O^+Ey9oL<z zEi`PGJOQyuk>0)JfjxxjBES4yrLB>QDs9&x;uMp`c^XLQq44<P<k+n6nFHoAB6&~# z@<1TM<)GiwW5W4(8MG!B(*i#@&TBtau>*O4=-Kf$w(5yi8uJ?UsxMS4NkbqIsofbM z?CEucjFWy19%}==EYZzo@?^zn0Q91V{h=LIpy8*~uz`n%H>7hSc5O|2peEIBs&=S8 z(snA^WvdSNcY?bNYSJoUTCn4xOmvPg`HN)U6)`ycdcu8Kj_U<AH2dbff=`~@fLese zu6Q$_MXTG2Z>EjWb0PZTS&_uO*~hoCt~|jlCM1tqu&oJ8sG+MmCx#PI(+IQEBUB}n z`XdDGiKYB5He@_F5GJ+dMx*@v{IavNQ8QNJ?(-~m*SFLpszWTS+_H8Y0-w9_(PxO` zRo`b~yereKlP5A6Ezu?CN>xZ40<=G7YKzsPY&ap~j>m#hr=J~P+nx%t$)DslsW+@7 zl>s*V_KfQ69kh3zj!J9-*uiSOQqQlw(+xH~IrH1}&{iSXdZ+`+1F`#7yd*xpZ$~Xg z+YXr*=*mVa=C(#Oo;X<C3je(_g_=mF^h5p#liz=77)Zq)9hh@P;%yIaV*<aVq@<W- z*6A|;oT|GSLi{y3oMWANO5kZm<guy91ygB>#k{-MXB*h39VWMZmgUlPh7>n~`9TH# zZUg>OA1+3NmpZJbsn)-gsaIFPQ3|cTKCByA?rv%&E$>HiHzV+3p)cl?ZatbH>GK50 zO4&@Y%Cp<oi`q=QA(DYKU94QEjv`3X=g2m*lEU&*HB>3dHMT_#+7;k|`-z_rAfhLt zPsA@))R5xi;OOh?6wcT7CY8JKz@(P2hK`l_chL6Wo1n1%cX(p(8jC5AxINSSc|_tI z!b>4$OdUt6yZFf<oG<6&L~e>hn>d8UP9XdEXTxfqX4{e%=GUG1JO-GU|NN3RV@BDY zJQixmJ|pSxHHs@~PDI6nsvp})bBnwbq$1@I;Ea8Ba#1~~_~=D-m@M;JfhH_AD{rD# z18)$hYF@$TtzzyhT_#Zlg?7$FZMRBxEd5#6%;jvx%ZPd2l`BPq%@fm*JNhGVFAqUk z>A~;hig-f)EX^IABCk&B_A9#hiq-lhhu8}Jg38@tv#ND!RD)4@E4)0aUf6o1X}Q?U zs-D-$mGSGvzT4O+C`=02we+<qMc0$JYdasI(S4UclL{QN>7)q{>^`)_ja5ON>7)AA 
z5P+iLI_KLvc|+x6*d?kxt&K<{y6R=$Uabkc#ui~Jx0Bk0gr243ToDK#tahUmbfOQi zu)61fbEyf6i?bLDN{INJVHi$Sn0{=5GTkA%RkrGT&@`D?yYO_3E_`F3<K8`PCA_(7 z^(PiLz0TP2NQO|7G$%}tvR1S~MIE0$`nPYAc6Vxslto*apNF<1jm7W$FR^TK|25ra z9K5jaR0tUU=SS56)1MRc+uk##aH0sgCx}{nVaT&zbmQX@MNiaoNs6>Yt8MGC9jR_v zy5otw3dac#BhB4P_Z7Z)*BN0!hl)&G|0p@LN=uD06-oP~nW{wzDAsN4R$dAh08A^; zigR3_1VjVU4_IQyUDZ{0zj%+eNQILEcVgVPZ|u(^<q!@!23f(ZjjO+9cWSb9js_Ks zmZpAs4gYB~j)ERmRzlc=ijixE5EK`YpS3Ub$}+lu8(mfM5M_E7rQ~|@Wl8m}#@_1B z%;;40k0+hTdOk}vP1M<(<EcsZG5@DM)Yr`gVj3&U&Z6%@_Agw8fk_XJmE2hwkP)@n zRN-@5h+DS1HnH>dc=YYI$JXk_B)3`)J!n5IAc;NU;?n)s5%Z6kHfKw!Q9KPh7~C3Y za>S?qG<BRjwg1{EVH)+r(?)HN>w2=`yDS3ZyrY!5U8Gq#-derV?&M>}O3h@9eX_t= zqRa<JW8&SZqo!IR$NZYJ*4p{2HX0H+!sCS4XJey_6m4H9uQleKC=Ypb1gv3Aqels% z?IgEB;#wesI7M|Q*AAIaaWiGxBkzf=jmBEDlg|X0EO}*NrG@iA6T_TR{CU0I&D43r zFw<}Sk@<<OQT*VifYxiVmKVa5H9k9>g*u+CW@dZt4j^f&+=gzg?zy#}h)%BUedhUj zRu+@ou;=GUK<~nK=7eHRx{CQhY&H~JVI;2GX4-13IJPUCF6zAdK1Jy0+n8qYjwY4; zQL-W;+oeFp69`V{3}Z;&4AgxPHPAj?1fStv{xMdT)EcJp?VRpU$`00)Q}6M0_wae| zMB$r;mm({l+X(0j=3|JVik_@36A%#{zE19<7Sn3gtn!Ny^0fEhHl2{6!PQB+$$9FM z9j#LvM0UB$lbTHQk;f+8aE;@N(BfX+TUOPk3a(6)d3wl-mg`D|sp;;{y5(-rPc}@= zxAn(`{<4-!2Mr?=<sJ2!=;sKCy=>r+b)l>YUY8-Zclm|J`!?w<hEADs?5t#k@U!;m z`PUb}b2)Y_tSY(c1K5D`Z6}J#i!9c-<8fp{r2xnhX*WH&%VL*p-Xl`+a(L3E(3QLQ z)zsX_<Z(tKr7_%Y`g1;1pM}sxLF%}gml3;Q?N|sOoabjwM7y>-1{nJgN;GqSjSP=- z-A4ndIZ+~OwjcNAP6$oOGz5-PFyY*kIA{CZ-JEzo1{%U?QQuM3?+T~tY?E7HnBU#S zV%<<XqxGaMSN3K<!Pm8*M_m4<I^S|X=zrc)&;_*v;0kWRDZ@Q)>~dAxx|^O3lvsd3 z_yP5s?U+*4$7vkuxmgSBa@{(hM6u{?CR)FcOj|qnaxfFB;z1;6yp%BILc&!yUUJg2 z-CHwfmp|OWw*zLK_%W0Y4F3Mkb$b&?)YNwpjl+d1{Ew8A?cmSF%<Odwpm)tLWY?TM z^ehq}5%2lO@E;(2N$^fgxV9*gd{NPdbiB&r7eLUtV<xKbjR~$UH#=ZEz1@{oC%(7P z)O0J$BQ$+ZBoY*2tMHT4V)u6m+xc1&-|ad%w!2l<vW57@?T=1O6G!CqpeF*9&p5E~ zEugQDgx7~oxcdz0fzr!Y(DxtFIYT5^UWWHQgByV|!@B#NC8!uD97KI5pa;$$lUi(P zk4}i(u>l~mLWKrJ<?TJXN&C)1KJH<j$H&q$-lQr=kN5f@Rq*ZZ_Cu_+Wl>MroeEYg zYO5wHT{|Nt13!#kd}0cC17@J1US=4)<ooPg{RIr~qVAt@1BAaRwRap7>hTlCe6`KZ 
zv}TC`CMptuVsd({0tS{tT`xXj;^uN8^4|t!ZQ9eA>=&JS(Dr=+hW~kk!pU!H>iWyA z&W(CdA5s|?_GNvs)<LJT4$#I2xTO!L_yf6L64W}`xlxzC#d_J(Kd@NLPcZ6oT+1H8 zrw1Taz9-+l$ojM=QayRp9(r+^FqJ@!N^HQl0q*?64TPJ7^jL>gkrL6bX`RSeR)@K{ z!gN9A2JWSE(t&??DsyxkfR(o(UMiYS?3om#2tyt`G#ffm@Ia?H<k9R0JE@e_seWCQ zU~~r#@S`0ZAlox}bWDM;k~`*o?HyIV4Vuk7LubzG`9E;V&UpvVg60~wKxTnFx@Hy% z;*1(TW&mP0<Ss<a0}QYS@P%joc0GwOHl>H%9hnuk4F)i9mfN2ASoz7vnpK4P^0gzM z>|#&HZGcs{G|Fq)7eIALW~$s|;aIZHR9*YF6R)Husd)8pNz{jzq!;ne33YuB)y62o z0P5`nETnz?hB5t#0`7LVJAjy6tC={TmUr#=(GC_pz)Au9-umwbs2!4NhgIle!SuuX zWPX{@_gUxEwPqQl*-H+3+UdobV{9~8#9-QSY-m?@<)|Y2xb&+-u(E}~s-74;3kt%6 z5kzfGfEWkj@lODw;O`d-Zj!KD2=M8P34AfZ!V|3R|KtWtxlf&q!<ff<CmkQvE=T_% zk^};8gw7cPp6N*m`fRHov9L_2SNeultHuK~DzQqgvM0KfY>vfrVZVkk*X#=p^VDlK z>3j&5)zGs8$=BjOq+%0gdLsa8Y)|=Pu<`KmaAH012t%k)8a*h@mh2Ayp1vIfwL#sR zTC8%E>jia|bj2)ovwzDI9JU8Jb=C2DLc`Y7Y3N_ucLvGi<*NvbKyV^<<i)xLX0x{i z9n7&L9Y?%wGOrLUOnwn(SX<W5Jgk-rBI^MEBRme6k%h+Gu2ZS7g&!cRohv?Iosu@P z{N1%wt_ug=Y!KzVXAglfFsoeXE}EmRZ%-{IUTZCr=&_4<+ACpx21~lON}N*vDkOzM z>?hdhaN^LXrCk-n4|^;(SH_<9F3OR(TNsNbKeSEmPZ0b;s8r_Qw9wXX6nb*Jl)g0r zt(rpE>==V0+wj+Hk3(5h4974}0KhL0({LgSy+HB+j-|_Mh#U0iC{(con<mhzI~$*# z$F!nk9!b%^bn$&vo5rnOH*)e;z%Ko$5#9pR5Iwf~7!zu#%@+oa#sIbr2V8|^E5A!y zvb7W2t(RjGYGme)C6nRgC8x~MQ?@JJxPmdA2-9oesDxIQmMM`YGOx@hwp4z=c^mEK zRrii<8hLavWc-N5ZdyGb>J^vC?{=tbmRiCZdp?CZnzGrf4@*FlOT7{=aq;|<rHM-) zK`}5|nbd74ZmD(tn)fnv)O@&dO)6moH%aIbx_-Ig#Xc)8Oxw}z$UZo{dqvV;QH*A^ z&e8`WHS!6B^@&WCVPcl+RJ6QRe7V4dyQivJW3wzYx+^>S?CcaVb%)-b&<H@CXb_-O zNbP4rHR4sPCS+wckF(<J7<Fq;bp&0HVlJ<a){B{OR9x+=XDd_+XY~Gh<w#RU#lazO zOaU9BO@x}6XrOL!vbJmoPEK&D#@a(Y?T%aE<4%N1r<BhMAHmpdn5kOBfYInxJYyBZ zz63x4(XwfwfW!h`l+<e}@X8W$>G#Hlbu24j<cOp|=s=qRbd74BzhS&4?Q01d*;;H> zW->TgSFc^mMlNl5&fw_2=uW+jn`w>Np~Mzair_Em1j)4yO5BspxSnWeQ@wbAHe^M} zb}xYpLx3L$?D2N)e0TRdJXrpkCSs;%soc20vL-Hi_OiE*X(CmNoAsDAYW)}~&zP;$ zd8M%NbYbD^$MTikBNTs<DuZg3BoNC5GG?BhKp0t-QC_ce2%0!$!sNauUG>U4@C-94 z2>cQfnn=8uaOsmQ#sM=`S2s;HUp<<luB0mn)Xxf4nWLmDGz#D3=Fnsymzw8cN&Ju0 
za~`FCo9<&hjxOx$xDwc+J1ehs4Bn*P*2(5AQ)7v@R35Q)sofEN0&8IDjF`IFe!_g& zkP+A$NSV<G%3!}93eJWq^R2s?3+z$cjVc{`SD`bCGqW`caVyRZ{msAd2G_8FW0d?T z<qh*Rp(bK=lHn~YBo5h53<M`*Wcis{{^_R9yOo$%i#V^kQFAO-^2!j@S%!uUrQs)d za;GTfstvKWflBc9_Zy`L7UZud80`c`uez=eR-|a(r%JN+?Y0EEP6l7dsTnUCUq1u+ z62Bsfdk8(qRl=b(2%z&lcQcPuJU>qlk-Gar-27!kC;02}k0#Z%?1o(*3;OKxNd6?Z z;=GeV5T@8}5;C4@g5dB0tW1@Ol<#ivCs!$>aR)%&(&qxLe9Vtg^GYo?Ga5m@oDlHk z1-pDtxRQi&z}EptXMzUaShWGUKK>)}Z-A=?cw5+)?Xz#?)a%>lvGjgUV)1T6Z@WcS zCe>vmO<Kw`qdQ9r-sFOJR3r8wgkADnO51_;-lSL%12xLHFS(gUdB9pmv_)@nSV28H zs|IN)5eLHdK6r9K^rgh56^(LoMwT$L`i9P!>MH6U)byS$y?4cNDM0rdl^*JyTR`Gk zkEbG!I?xxjiMkAJpI@OigdQ#Nfa=AWlVh9uc*Fj+^`L-VniVE~^2y6a&P;pJ!*4yU z^RZ5@UbM~(1huBcsOtHAa6ezV)_A6=Xa&$qgA18IG{^UXd=pLA;uBQc={{t~*)h}+ zmFauW@&6ol7K}1SIN)2FPN?z~)>flGU_C8J1%__b{mG{dKChUc)9K>tgrBdxsnF<# z(YUpH%i~|q3I|<>8u#vbTaR8sdND~4o43nf26a=wi|?<$-Sp@TON=a{4c~UEoivV) zoGG@p&)RnSAi`i{IKI2TU#)6A)?8&-+Y@vAt#Xqbwh(6oUJ(&H6|$P7_H^Ta<$2+` zW28+&&`(eT5J{}1pDQJ!eY-CBr2gz{*Us!P@3XR*;n^xjk;UUfkmT_GjOFMNMU%xf zw%>xgM^R@FcimdmNyb<_>B44d68;y9F`ipv3dN&9J~Mlsy&+Kf?OQJIM%4b#a-9+@ zmYJ!4h0$y=`EZF_{=HhL{ZeL1b#?!xWf1ya7dr+#(hq!*2A0^$!trTJuE^eMYD(u} z1IRzkEH!u%ASO<|h@{epAFe&XQ7r*N@A0CjC`QNw5chRp{V!-O)FDpTlHA^R7Ur3W zgdF(lf_r<!L(^UfKqR&}x-;HFU{D^xDGC}qxV;_TG9<U;UjOck!lU@co=Uz>d9A|; zFj^9u=J&T{+`LHvnN0++2m)Ku1aC=7Ew@a}7L7SI%=tY^l6M>xjrP(^eCegp$zB$% zBYF8Y>19T_$T-MKJETpk6ulM77yov$`rR7K)rHyiS4e%eVHZU;l#@Ih=s}of9(zqS z`a;L>m$5RV-j|@3czXIZs?NV=pTse0uL>~d=92VL?frl;l!z=O4qmyTghwHwYW&L| zQlV;TY}{7=-auWRnDv)-Le{B!({|Immi&QRa%y-X`09+UU!xtZ8TIJS15!@R&}Wdl z;6aK~3Y4)K%oRvccJ24{+V2Sk$_9>DE^8kkNqSfgjnR&;Kf(4b|5PY;HoC5ZC@2Up zuZ|b{(^)Zd-~}UWeWJehR6l9aE7lh#e~G*_>}8U0eQ_~+1h2go<c_iz_}YsDafZlE z!F74j&z%PMd>MXLu00UJ<;j!JnM=I!1h8Kim59n%DzYE-#B9&Z&fVfPfiQx0wT15I z*C0T-m%Z_)ncCAe`2GYL%zBg9+s0`DpMX}mHIxpq-f%Qlnhz+wAh;-CtrZKLeL68z z!)~zLI0}m50`I(ysFN~O$v77)2;>XXHXe6I`1<<JExqkC(|jWQ^-0?HLx}DSnDgxp zpbkOQgt6>Vz#t-^$F^}FftLpK9dRv2#;blu2kWe^X=2kmF+g~f#xKc;SsL01VN7;P 
zgVqvq&)*ifA3~pGr*o^pjl~I5>Al?XaV!0f`G#}M!H45G|5SR@wRamhUEs}Xm@UA* z$Y6hr8$Vr%o5<A2Nb4Ld>4_LyAO1OE*a^_W2V^BOsyx-Pe4N5(j&Q)o0hxlXes8jH z;Wbzk_VqjP%3kFvVPPQ38e$*BSQS#{Cvc>_;S9Mi#2EYRCKBnX?aci}OYw~o{f<&V zNC3tymyUA>itP(F>(G!3G|R1u2{1amD_B|Wm%h^61)JeXGU>@n>)H4J?Z&V3mE{_? zm-+yo0O}YBBA3251dn|*{J<;WO4)=z11!c{bQMug6S{dnQ=iiY^VxV!#~lEr)z$1b zrYiNL^nrC%pIzDhL|lHH7|Na_co1Sb5>O{KXP*Y;@L+rxCC#63kaQiP^h-+~P+dIn zmy7L1b?jxT#)^mpYJ$^5be2gTpHm!X*Y`Jn>FMdQGQa>0*>!i7+I0j_5dask2{Ex< zR2a4UjJ%kC=~mutMdWRs-n@h@sY(mM$SM984}nEvX=!Ze?SpfM^aX7Z5fMyX@pcyA zz7mZ9!qC1}ZtBN~PKF%3*~j2TdRzX&#*zW)eDKmnddOO$U7j}MmPXWB{=)qzY4FVg z8nqD>X6^j<#{xj#9<VPk1TDa~xg2uEb%7`;{Uu786#2e0OYrXoQaRV|0vVFp{iRQ% z&q>s~gKcg8pw>Yqbbw`p6l>SREosWD#qsI4BLe~gyf!Wo2PjHqtAq^J3#+nU*wG-c zGlQ~F&eM9NzJ9$fZ&#DJt$f~C#P}BiA~Z{{T@?n97h9Wjisf#9Hc$tg<fqw@34plV z%AKLaqe1$Flk>Vs!`<Ewf;d~9lB}J8$Xr~k;6A+lWrC4ypGA<_sd9d$xPQOBa&3_; z-*Rn^_tT-<3m1?7!QPvPQ@ywS-&9IuC`yuwiZY~<nG`aUDKeMLV_4>~43U%}vqfey zWo98XA@dx|GS4!U%+v4uF73UqYhTxWKlkr`p5r<0Kkn~wwD(GDE#KiYoS)D6e!os6 zcU#DKE`f9u9>6NjA%1CgPfqA_3`;NMj50U>zA>At3%;t!GlH3C$K*m^DD<z>M^hyY z=a`O5)|r-0tX)JYU5NnQbaO7`jD_aS9UbrE#Pcl#;#+7SpO?{0LHCQQC1#OH9JkR$ zSBwtDV9wJ;?W|R{=4V~)c}!)LJY9B*4y`6nr%Q|TtFxRL<DRKrS8{oEG?$)JJp&*Y zeCip|mXg<BSYV|b5UuoYl`;rgzQJD09|PD~*Ubdv50cL1IdJgT>guZ($dPqF@5A<Y zxGHh^l$)6sQAq0HFq%?td8P|q+Q&JM_%m;vx_a|wj?hT|`+8y@?;LGC>h=ttlr|#G z>(ZgZoA(}=@a$?lFt3yepa8qS!Hf|iBMMzzkCJ6l9j#-O6ON&pMBUP1aYq}ftKN!+ z6kOI^Z7Am1P)^X<pj3{uZ|)}c%FHE-hngccxEe(2={fDO&21I}XG{DQMQCxwwy#fK zyn&a<hT)Ub6a#tN0-I^D9|_|$g}vY}FM5h7OQH~ofPun?+3{Or%;<O!vhj0qjh9i) zt%#LNkiIt)6@|05V-kDZB04F>I-}U*vv`3nH!wE4%ha0g#xte^T2X{1FIB6@)vl(W zxJ7i}VzEU7VZv=Px$FG21)pG<hHR`q|8!EaLz@hrDQ92=B_%9Ev<FHWQKGFr;lFeW z5gBFntq&l{%~)#9u(OR5A%AO^YX;q_G}JVp)Ek9XH&Jnmz+_&0ds!!vAXqSmeI}0P z{63{VPpibML|PiXse@m6^D+0?Pn+EltP(2E3dLSZTuO249x3RzbwJiou;KK|T*v75 zR@CZE8y+cXNl*d7fZ3}Ji=``_*sAZF6bpTbjtNnxTdam-?X4KnL`4f*=aSBDrx!nQ zIfl6!Q2E7{C3~JSB1oP}Q7MUBE8FFs$+AGrEJ-cr@~v{1rmBc9KQOkh+joU)-wIY% 
z)zNNMjRAcGdIZ7T2#-D2H;@Mou7sbffJsl$I8Q#g&8feQsPH%o1g!#@5TX%=(!H&6 z)_F2dj{z&}CnKVemBi3TjLik2$gn!~LU-q9L=BxnB6SAbm>(6w-@maW7N}EYD!rX# zZ08_erJMjIaTMdaq8iN)!9qYN)$2d4H7_>c@)GAT%%*K46P-_$w@msL$AO%a|IkY; zNPyd}(OA&gkqe-7v_vrEen#Yek<j<%Pb;KK+C@+uP3v|VUvkR!Y{(cOIaAQPoJreS zFMjnM4RS(ypDd(q^q}_ZSr{Ap?Zc!OMN*3i-E6~0;Z24(wWp_J)H76`Qh>RiH@NlJ zuR=_oMrDd6h`7)pIrd~9&41{<ywN%x4d=ugk3T1=LcPz=cXr&NO$4c{NIjc>pilM{ z6u+5Wf~?u?pb;*L&?N+Z8muXsPs&y`jVUxWQ+)isb>T(9yHKp36lbVjz7iTjloG~k z_yB+`p`0=}Dr5goVdT;B3)+q$0ZI0){PiXt2fQo&rCl-mBDaeuW|31zs=(rs7IQ|Z z4_7o7>iYd3GQl8GzJ;19$$!FqIC|31B5Sdo|7}8+ve0cs8Pr|}46dL<q-L<;^V?xm z6E+KR60_7{|9KLte5J5GDHUhOA;d{SUor@Wt0@tXLEs7vS;4Y9%tx^DM@;16Fp1q? z0uNU%>rCs2<_#9h)kU0`5SQJvwTvF$;ITp#ma<6&kQh-o`n%Nc!xrO0-<0u~Etnph z!FFx?WiTf$Wl~-!fi7@+k`#Hytr+++`pDZ-E&*4MYtC@@uZq{^9{C#TtduA(ublCE za@2;rXmQpGBccAaLQ3;nXQjG}8mB@z>)J+-{O#nkEmvk?^jusYVa?%OxoP77>A3ST zJy9@tcIpV<ciA4gg{cKm{u8)T*n21kvumplw|T|75r@$B_OrdX@NS&j5Od4TcX-A0 zZ9O)fif`LPFcTL^?B#kuXLrn|M7IAd(yGx3xWyj&xfC5&>o`B5W((2`r3_x-Lo{T3 z%qg6qendwCx*3y)Y9dVuBaD0^`eoe|Qq?SuFv0{^usg(|x=fv(#>9y0thb<EUT?o? 
zti6a=cUAvrd*$A_J%n_?b?AU6eNZ!KTd~kLW?Wn~Nt2ztZ}#)up<5@SjmtE$_eJ8A zYv%$K_wTb4_gHn0jo<$?49FnTi1d8XbpOKxy3dC{(ih2IM$41-bX`K+KCiqf7-#MC zd5dZYHQ!3iMc^8n<xhyz$<H@>TUrR}){Lmj5X0kmV{&%APNJXU=EhXgydHn;o;N|+ zp(RlM$E!tTxJ`|`lU;Q9N~-NwSwyC@IdK2DtFvliS?KFaOgl^Z_?0t6CayQnwJL@x zR4t8+y(Kh2!5Yth9hz#uQKiV|GMCpF+v1iR#0r}}<bI>f5A7OZt1_1Dep4U6Ju=<f zTQA~g&dn+S=pTo3X=Xj^$k=ifQEay^{eZyq!@@<vyBYnfOh6OTIGPkB-=w9%S6<T_ z>fFKVy;$B+Y@umL3EI_ylW8Ak4w{;_2DMCfNyqEuUE-v+7<+ItzGU$D?a*D>?i*6$ zHoY7uwP&o(C-~l;-SiAiJU&u%h5^VO%?`?ukHv%}=Jl?$G<{HW^!cuzfI{oum_R|i zvPk;b(1@8P;LdWU)ZEg8Y}ZeHHtz-PY@8*K7ES*cbW|XwnCt-&;y`6nTatuO!}!Yg zUfW+&ZC#maJ#I9smHvagpI>$zv<rj>o!HBurlGItke(HO4=_^fZ9#XW5_Lq<v@PZW zF{gwsu^R`4-L$!p*r+T>aRDvw;vU{vLe3|ug=7Etm{G-HDk<)@c!Q6|-CsBwe}!hn zNDY~UBVUdT=-{&IC%Q*bZv;-+ei5S~@$o(Qq`kh)f5g>a`L6mAni7P&^k#Zt!9jTH z#RGJ7GzJzOor*b-TK}bWQ)B|wT-pPd5!LGC>l_z*p{97wGpb1cuwtE?$S3w>Qa-}a zw?KDVX1hLk_+=vmjH=E|Y29Fg$WW1<e+O2O3#ggL!Wn@d2crcpRZV^6Fnw*J1~9Vx zBkc5{L=JH-<Yg)0UI#cB)mg&Ep1f`tS7Et0xeK2+H9vK_Ta3EA?qM>-v)q><wzi3; z*D9j6;gl0ao8to!4ym+`ncMXX-LOj#+Xqd#+nGzBnRh*3^vr{`Rf}y`A{jBIUM`Ra zBc2@AV_sl1L7Y~kN28+n&q1;UmVXd6;jNP-k(0C-oWr0SMB^ZgnS!{fP%$S{EqA41 zdMtB{)IemCN2D9gT}Nl{)Bt8H*0AS;7Cw!d{G1sMG?I9s60eVzxuW+;qZxBrw$kck zylN>u!HOxA7Gb4H;l^n+Xc@30w|k2LXE>MqZ2nPjUi2X0!K!j+t>F^XK#vsFhfdKh zp`f9jm#<zmgb6)^#hkpz`&_0lX)u~hPn)RMr=!7N2r8(aR!alyjLw<s-6p>ZdX1S$ zdzR1c_!(L-KVvI6uG1Tv=F6Wwy)<6NI=R&KwQ8SHoYMqAPsUo7CeS}>BQgwgAK{%! 
zcCgP3+qj4u2XcKR3@~=CB15I-F;D!Y0QmtOV$e6dnJ)f>`<rIz*h6p~E_~ZTg9v!g z!!ZJR%TH3_^}fuiFyXX<_-|W*xhE~;_i2G#OvDc&74>$PV0hC9mz}wOWp^RS(<IY) zEdNZ}-6~yLgZ&J*wmZ1)NBJ{N+iP(51!d;EDfbo^#*3C<V2q6i(!gtuttU^Oc+#DA zQXYLNJ%&rnvNN79zGxZFrk=PWOYPLxU%Zfa2lf*lSl_*;i8&9LP>m<Yrm5}N&%p8_ z+<@q`;^WbeRW3<$=7k+IXCxQRx=K1kzWLUDG{;=D!=8AjPNgP7rDjOgQ+2~kfNv`^ z8td^cwIw2k^J(05gjjh_9UL@n(6^Bx$bUvP>;8}6SFWFO$>oXL1_yWH^%&x|ALsCx zu8>y!#(w|Z8;xvT?JHJ~pUxh{aD>fPe7o0vI7OP(*FtopuEF2zvJ%Q>q;S?O9P}4c zmMZNG{3`7UcM8fva`yLwGy2pq4*K-ufSy=X2xni}=VeAhzIko52c?AsGxJN%^Jk36 zkkMuYS)LoHHy?{n#Xi*!Wo<r?o9KC>t=|89EynOV?Fl=imwT@HJ(2_fT$uFrw4_LK zIe@z6T>toBZK}22Ul|r*gTkQFm%|ST1dB%3$fiG)^8}6CM1?uG9Vq*CWLPnIYV1OY z0V<XY`IUY#Yrd#(gZldE_URjZ^dCUS8q;*_%$?`*@|Vik`1j6B@6vk)w1N#k-}G@x zaenlfGiP$TOg~Y!C4`3hc_UCgf)TcE<Q*7uZW$r^59j!12>AJ|C8zZCpnXE>>Jio+ zSpp^F)6ZZYMgG$IH|J}M@MK6!wuTuJuYew+DU3$P7H@rhgGJIw_yYp@b7EVf{0!a( z>M2Sl5C>Z460~08<5s84#_Ospnz(SD?_fc#t2?;12I$x8xDy1DFMj;^0X)5tvdELY zg_ioE)8ZFuL5hAt<Oo>UV0#Im3kTbU^2<Gov7jRvxcU6XQGaOZM@87CE9)T0us}95 z44c5jUoCKqs@>l~W=6JTA3=zBZqt2+TPFC?2<Qzoi5=LgMif;m+HaB)4M|$q4;cvd z{KG_j%i#~G!M}qdNh!&6$*(UbXZWHLLYePQH+9GhG^xlKGVCd}34$)Ap-(j&Z7*tx z-}%M7+~c$~5oKD>_8LrT%Wy8$20;A)_(Sp1Yz3!0&$mc1$Jx2iqP}|g8Vh+A0}T7& z#k7)vV<N5E;58dYXLXG^^i11MMqA4CS1tqJq-uMTZjf^=E_ZUS<ZhgxlTC9hUm9Bc znixGmP$sD#G{<(D*k10+7|j;0WcTKsE9A@Ib8R7I__Q)tqKg?*npAW7>OLN(=1rNX zD&X|9<#H|HM3lA~PNu77)@vCz$2z<(ZN9nr9l~uRuaD0tCT;jQ0|TOX23?h#S`+I! 
z)~c8YJef?w7Og7QnXA|n_?>>TcgRT|+w-Tdb(?m^{JPJJT;YZUTc_Pm{kgFiS2v%Z z-I`h&3y$OCYMr`+n{lgguGpk#`OV!-;vrY-%1NOG#j_sk;}L_7qhwU<(Sd8E6i9O= zkPJbH9ox(Owk=#hKS;7m?e6lUMzj<7e{|bi$*78R7;tJfmMheQJAqI@HtK+gqO2Sk z<%95ljN1`<3ZPLOeope)PBg{~51PBc(rX!@C2vJz2X=Q>>CBH{8i76HVsv_h)+n~7 z@t1LDuKRK(jFHP;XSd#L*}*r<gtIvTgT$K>ZgXluFMwE@V^L&$AnfcaT|$P_VILaM ztjc=HJ%6>k)w27-d5nV;ADa+(datjyc<jvT?)I%OBj^>-cajyS37iHKC56YD*U~IN zOB{CAC+W;v%tiSw?Ooi-v={Ol2sowBEH}zj*1>VEsHYP)@2yvQjbcn2zwhZ$2ndc% zz&Q@78=+Hf{sfvxL9AH^LWY5tnXh62)S_|hdHNp^moatNV%eCDfolLRMU1H|p#MIQ z&Y;{sbX86f{Lz91Nl&oxuJ+q@N$7ZNFAr`_@TB&IYE`W_S;Zb7PPT0kasXVHZ#1XN zA73j5M4aLNGUNkJ<Hv0<zVMMMCun4y5);j68Cvvj@*Baq-~nLbXg{tY>hsF2`1utg z-KhAS9F1)Itu<?QfMfy=hiJiNaP!3tlkhz~GX7@3Qv`6$Vu0?Jh|A&&&gQk&XE&4C zuMpOn0fGju)NBdVi#tH~Sb-b%E|}>LDAENihx?BJ;OW$CufewtRILS6wWj98jO)1m z_`JLJYIhCZKjc1>yStGq44@Xf_fkQB;Ce=!x;v!)BLGi<!A36h{vAAg2ReGUrWLfX zs?Al8U5Bc3ey8|<Q*r_O^<Sg@OdYG*-ClTgR(NM5fCGkIJp|1IQK5c{AMYPr3GH!& zz4q#?2ebqj4!aTb3?j<B|E?M(`WKtaN5>8F74RgAfLSye$OGYqIAD-gYhd9;D*p!y z3eyXb^3GS*Ts%Js{)03M1gEsUf=#9CVn{>V_f<G8A^a2=!s;F80h;4EEnY+SyYmq| z_-3wdb8_$T__6<PI|S8uu@Cao|52~rfpHr~lecgwmB3DE&IOJVrsm6q5B2aAVHE8% zNkbq#7E+!7XObbl?cf8BC$-G30ryP{hEM%vTf)Qq(5o(QN)rc5r%dHx##Xmr+?;4k zIGOkoK9Z@Ld6)t5AcyhK6?Vv$T$*2tqqtj%fp@MMU8^Vo{0W|e^Ygk{t_rmP5kccY z`0()AqsW~qPyiIqmd_r<jo|7ZBsF-&Pzz%%JO~pTXKDs9`10Atbwlia0^{7l-C^O~ zwOZj8A<R$~(L~=<%cb(coo3q0_8Zf-gG=vMBP}Q}Ul1(Rv)#43lRc}@yo_S4eQQ?U z!~oFhL)2R?#zugs7`H?A&I!nH>r24E+WIkWlGH><P}uvo=(E)zTz9R$M`t^Gix0j1 zX~N3&v$1Wf5XADmg@&0+vtAT8k#<gJPJw}Zm<*7rT6U?gGw+Sl1g$=QK*Rpdy`KvD zf0I8diHeOqjXxl_M2xiTTeY%?>wVce=H$iK_0nN=mj#iDi3YK|KiK_62zxxpc8|9= z`BC^R95-+z#0;ObIv}olP<-;8Gf*p7v(6w$7!ZVbQ|>pxuu{V}ATh`6oThWb7mk$` zb}nQo19tC=z|?^t8$`K*FJDGO5P?61XD@<CZy+xFJxBBns-x3$y4^^`!P<(>*=)a; z8sj;C<0^6hgv0R=B09z&AOEAf^!I<gkNb`NIa`vcc0Y@a_A988M2Io02%IXv{QfZ- z3m&*$!;LqTdLO^UK#+->I5g-kNo-x89{~HS`WW^pZdiGV_yq?;9*z*<52C()vk;Id znRD&W!S8}wV)-x@X}^u#`YF;542-Z@g)Bg}JAlRmo+hwGZq&Sd1`(FFv5k!l3OPC< 
z@H>G2x<=1C{>uB)iOqhm0jK9Q`(Q`)=X(QWb5)FxK9Li-{=>f!H`nu>=#z*)V1H|j z@f)#mPj29RB!8`NY`O~{rpGgKe&4!Vk~C-kU|0QfkS59l#AV>lhe<>B5r5xqaYlxc zAI4{{9m}-1ag_L<dWJV*q%p96L&6NEDUL!5fyT_j64G@Y*yJhgwHp+2rTA71xf0xV z5V)v%v&t{pj4p0298u?wkWU*8ZS|=t?6bS~W@)T8E#dBI<SRejUX;tUz#`rr6UD=Q zX5Buhf5F+jmcF=b-H<;v8YL_4BE=MHt&<{jaj<Ut*rX5eY?Ly_Y|4fV8yq;ON#?cd zg$`6{g-Ge*i8kx5114T7NX-UF_(jC?udf$cV0*8tZftBo4pQWdqmitvEa-|E^|8f` zKBMdYx0Jwc1;{4+1B695(g_%^1M^`Aw!HD+7hpnxo?4PS>e(6*^;^%z1HsXD3~13W zI(=SHe!)_Q8`2Av$to;F6<?99+c77%2<eNE@o6CGozS|X1}KKRDomjnU8WCLhCbOg zlfn``^p5mTwW$5!4dW>O?DrQe*&1N?Dcs=mVu0wJ&OOLE@hoAP8hM!b>-{H3l#Bhx zPL*3s0%7X+vICGzAaWPl+lT*uz)jUpxAJG7tA7QA8X-3R2P?j)kQ08La#P-vz$R*L zdH8(j0@%@4O@#v5Mnr@m!x%8B&pTC?*GRa2iJKjvqFZTekdqb^qF`PCoaU`M&1R@B zk*>Lqj8x*r7uuaDJimhOs~+sK@Q2;&xX1S!Ib&3yJ{hR*PCdbnU%teL%E7JoZe%=~ z|7>>tLYQ>hoV7xR%Ge>f?MsAmf3#=N%tu>_guq&gEZJoW=v>e(dCV(%ks$%mxu?bu zoU<vY49Q`DbV>i)Y17N>IhJ|`1V${iSA*^UPM!Jugg;zRe;6SS5D<X6*{m8=8ODNh zN+zr8|3sS^zDsq~%f;^szab>`QB;qgr#`!iXT4A!WYE5!&ypSHEnlja3l7*THm7}0 zn`@T5kk<_38vAy?r1;1{tWaO%=+(8e$Zzpf_X)c4y7&SrM3v!U=8}kwZ*4c3S0S!A zWUcPAPK^8d_hB8j(&ILYL)`OwJ?s-LgEoD!MaeY?>%iTd9eHKT%)nX;=idSpX$LHd z7A>Z70QqVHN-h!1IhzzeZYKD4SsI&lVxtG<%neh?Ls8QJX+StK1()yS-8V<6BXuNl zV$pRtZhpr5W|4i#zV<1V75+kE()JO36@Z9IZ9&tR)+T}_IVYCJbem1!Yq5nL-{{av zX^YF+LB}(}w!APpjJbP~gS7+UePv4?K{^Kz{eCmkT9DmpnAi}hG(4U&`6neS_nZEb z?^xolNWi8`*6;6pnKw;0b7G+hR}!0@M%E@(<nTggBz-pixv~VCsaaX^s$0<_W1~8Z zM0h|8^lHzhSVI9w5{uamV2g!r-n=4kHMG8NQ4vkbu~ja>4E(7&2_`0{4{UOo^tm0O zeNh(aLiw4W%cqlFgq!R#3b3m7j9R#P%LHAAb*^f+rUMHS7XC}L)`>=Q`-tBccQMr~ z=f*j#Kd2^Ja1xvUXwoBbfBReWPb>M_f1Tc?JRuhWstB0i#JO`hap4Cfn{-*c5kwgR zk+YB@z~>0<K&i?g23z~7*kcZKmhA2>lU~ia!Y<P$UV&2HkPby`@*s+K+a1<M-qOIa zHD;E(qq-B@?5?-O^Iorz@9^VsACNa&Vk-?0s1x>>8Q5dYH43`cmun84t=O2!6e{VQ zXHuD=_a4knS;S}_F1TAo?^|JU*pXBM7bDlgU8+0olzFmWe%ZcD8Fu#0oFRAOINiGL zZef{zEwQHjGq~Oaw|~X&UVbr>6C}-V70iVCa4$xcXU_h_Q{zOoE5Mlv|4+Cw`el0A zstuKp+2X5&(e-1GDBt<lLh;M8Nh;7vz#;nu=m|<07hXvU8Q4*<@XME|u^cb*kuSZh 
zg(u6nhrxz))lJCv6RM>y(JHpQuORrqM#9!o{@v7xt=UO!6}G`~X*bvW`k!@tB4I~b zI|d{_nF0~7DHBy#e0tf&ZtkDI=e1Dvsk{Uue_Lu!d*0!OUG3Y{10p8L&Z63IQaA4Y ziGvCLs;SMK{}>~K6%{oRJw(~=<j0>tmE%7EZRaUCh~vWNGGa>W+36pSgiP=%`jy`U zG=5ZY_l0|p)iPgsZ;kN^u)*AtyWfbeDq^!q_)>)R)wqs~vdHeqKH-C=4(j1uJw222 zhkjTxssnj@fz6`8leJ^oXq78t`Cj!CVg|w8=f_UB9XuN>yIq*Z`26h?@j17#`M%jh zmy<{{RyJzjRR$?3lH&S%;zy2_J`r(Zt<@J$D2Vsy^ktg`#1L7rEtq{B&#zEhZu|+x z<Z6plDn1j3ESNJG(rhDee1U~*W{~K2duzGWNy0nnxCehRtxT}r{Qh1$@SePcWZLX4 z;7>J39FUm!`O`-CrKR1{`(g#nOu0PfjN=}JMWz2te@UD9hR$90@r8Nbap(5j56{JG zjP8fg$=i?K*hd(s&yVYkm*@bPZQQ1ryr^l*v^AErI=7H>tzc9(=H$D#{@q9B_K%zn zwl{>SI70<${wQj%3A-W@T;x5C_>V$cH_@9M!|~(mIK#J>LpHgamU7;Hbja!(TkC5p zd&TqMb&RUjR1{x&;lN{YMrZCu+h=6#)3ldGFv|Kcv55FP`^AQr5_y{qN4%iSD=YqR zqsw3W!>%G@BkhkD&pHQINXf=WYvMh)j*;xJN?`eL$6HI_(fvEtOD4skyYJHH6Xpv^ zZRogOg({*4ui-b}m`%7y$i}_;(C#>=Sg|w~#|!E|+dpOYQ@rJQ5#o$JIWZw$E|aZE zJzimjvjD!rJoWtS&40^xSzszsVQG{mbtlq#cp@N3_&}|=-TAGlx_OfKM@~s6?IYp* zc-HWi(zJs{*OT@dfu80&IEqKNw9g>@;3P;gpIUKWY|(F8ip#W2@2n>|<4}MDBeR5{ z%(cP~JgXK-o2mE9$xu8@vO2x%2{v&6n;AZHf@XS1pPj^-<{`<r7tO>{|22@XxT}b` zx0Pm`^&EO|=Y;4TZHjSMW927>9KkQ$UZ5M!#kU+PQE<Cw^#1j>A=RRw?o9Z@#A@F! 
z0uqWD_-)Z^)S(JeYp>gUDm)wpWx6XD;Ni;4GaFsV-Mn&**81cZSRnY(;LA^N@2F3p z6dUhCC<dJ_cJ*h4<=7z(S6XeR(CNlynpOrV#bv30tN)6<8Y^#>u9x>HBWP(mre6Pu z>Jne!{<~=%aik{YYRQ(r&W0o;Ff2P4Tf}JaAvdJp`RQjgSBY<wfn=ylv*PNgk`E;) zj)oS&aa;6Wy0uzNgr5RNqepz`@Y|(>cfu_%CW=WK+BVy8*3i|m)T@5nbae0Fi<)R{ zRXDA2$!J?;mZV`kU(5cZZIP-(WX8BkUs$f27c&1>n=3&4c3fuMB4We5&YYDo`}Gh+ zG3gGa78w)44LJVUk3fT3W;e&a)NBfRKAbbaUwkR|M5%FG*j6&|Yf`=ZoAn>!>5}}R zsQ_%0r1=u4B5N4%iz?oVs_4r%C?*C<N0u;CHJ)Sj2@U^Wz&1~YHjIzSDnQpPQv|~O zyi*i^R@k@OE*9m-ju;)U(a4@iz5>D1XOTzuIFlKYm%8#|<z@IgiM8C@{~dT`gfnvR zu3(iIXdnmIb|q=>i!e`h`M}9zH)1=8@jMkfi5*%oNt$F%xZ|dLDN6F}N}`{|=3lU3 zKBz6TSY3<*Q$gy>j2A`^Cbgbf%gm15cp0atU8Y5TB_rVOy>qSCfd7e%kH1%x;C`wE z+tZ;vHQ--Llfs9iO_0**VJ{g#Xlx?Cp(@}cx{jlaOuN?t>8K0cU%mfRu1c_<8QIHA zB-W{Op=wU7<<IY3F)WtnJyOs?=1B4ztHQ`+bYLHBrH^?a%EVvy0Ah@gW#p}P^C77O zSk)_t@a7nQx@d1Q)D1g*+a1bt@s}1Ye@_I?)6DEG`7|<PbY$c^3r!s<x#VMg0~)!N zA-V@<&d+=})ucUc8sYomD6Yv*vhM5qzfo57Ks@&sWd$D~9)rlt;$~6g1yuHnGh|DB z6Z7RoH#m<TMxRS2y*v{ue<eL@zW;KP*~s~{Il00U-E|!M{@X;AC?jNmuKs?(s<BH# zL;YcXWuIgG{C@+Y!qca?c_rP&aNOwqqw~8(zac8_UJ}u!M6uV)Mv9g)H8v)B_CGrL z0-prBDo`dPHKI(@{S~bd>6zQcmjSmly!9WDDbW7PAr5bl%rt~DdU9i^pHur1(^Jig zPO+?p1}i6fTxxx<Iu^A&du=^x;qk+W{{)#*74>PdMGP5ATBYdY>e3bC98Blwgj$~R z&ZTy7ERgE5fKp(KIhx{@WKX1|EkC=+O02K2V0N_BfiL`)Hv39#%D%sxEP55VSpZ^u zmaw1L?4<%1VhbqZreTaNj-73*baN>a{RkC^Tx2b5^X{X>iya&Z=~ss*K?_tHJ3k}q zd(uXcc<w`YPWlj<S2y=zvwN26Ipp3`dN9ZkGHSfKmW=s)P25mMjmu$kT0MQNEYq__ zzi({$;XFxGT+;(@R2s3ZLAhuq+!pB+ja?PvguAnLqCnlCpLMjc$-8<xK_iKolTCd& z)$%+~9VD(9N23D9m)2qD4Oyo{r9`-k*HcZboZe2>+c3Wq@dLpVC`}gtck^>I?=x}H z;rW8gug`?W&I6Ffj`LGcuxj<H+AWc}U+;`$I`<#`=D*~4dP0<Ir{xUh<KN9*W;lcY zBI4vfLBuwaXk40z8)75x6nGu9O!YG88P)qEAKW@7-Lp5l+igGP0OMrx+E+yfyV}OL zR`k=~=dxqw7_@CCzt|Re`_6Q3@kUvfe|@8$pG0;Q_4KMb(?n1vVcjL^D{doEFG@dh zF@Fi6RZUQ6y=iW<e|Uq^c|tEgcqZdAHHmw$vP(ePR$r3I70z~3%NXM_Bl*b_@66?} zRWG)_5DDnMi97zd?+L(V(t?ybXDycfMz&TdZ$%eohK9|Ia$l3a1dDPizJKAIH2ovz zWcBZylgfJ`M_84YBHm6X_i3jG{0X3KeZun=l>Ojs1^Iu7i~aYRYHUp@2n00lPJ!2P 
zwY5?h1A6{5m{o}%y<ygI4uj1?FDtH@Op)W3bXwdvPS|h?kB*MYbG;a>PV$u|6B$U{ zD1xk6a*>!z!oQ`Q#5G0s%Qp`M$#=V&P+*?%IKOXQNPIo4h7&sQI47ICIoL{F6}E7x zN#YF5Ozyxw%MYy-rNeRF?Hr9ExnU9s+N<vM!RDIPE_?zgV7c}&4_Yzu&1oHKY^6yR z3S>fQY-hi2nT(ON!)0WC$&JTZESh<8`&P|+TJl}FZ}|S_BiQ3Xg58r_8`1)l>pq#P z3f(xbO~1bK@PHMaKC$eFEJ}nWgqcPl6ZmTr5<%xmu7dqCg_DQBR}0V?X1QmxUN0!} zToEqLtz1SS_!9`d5fclp9Tz#3{xD_bb%!?EM5?`h9>8U&gn{=dCFWZrFu$ChE>~F< zcKT>!g#lMQ_<6;Xt-SA-{YBnws@wt#zV7CS;=oJ^2b{Y{0K3U(j^2l*d2EIltaIG0 zLnco)dh)duW<393&g3+Z_?0eCg#{3e-+81}EZ=aqFHNXKL;26aKDYAho8vbhJCsiJ z#|d<#3Ou#S-2YX%@QLO8rf$`*TVJ@~>A`GnOE($UnR^c%3>i{G&DMv6JyBoW@;_Fo zn&u3!#59qUk=@}LRL{Hr`f#3085M(=uCDGHcP2gffcl%lJ!}_Uzve!a&wQ$TZ&j;q zr2H?2iJL`TQmg%l&eK}uTXQjIv&+3HfrYv7YhCW>RM6ml1|Hi1H%CWDkYbzp8o$ep zU6>83W&B`gt|?yUE9hl?GSKYnKd+q)Qi;Si!9L@btn|KC!##?Uk;@}v!x0{c48>sE z%0c6W#G)ES;2-tZsME!amF<4$=9ql{bG`Wc*~P+tw!EAqW<V%E^o<t%KfCC7vbfsg zRN1N)Lvt%PG}mqUH~pIi*PgdcpbDPNZwCv=Ayz5G=;>Adym>eZf(mmxcellwLr?%? z29x`9(_^O@tx=tw;><(CbOV5o3_MU~@U%_l`=UXHbO%-j&OjjJtqv%x^~*JEmU}vb zaq7JCXUt}<i@=G;0=gxvGb?Fne=VgM@5b0cmgkK&LOtodS7JP>X>e<Te1VZ<x}S_v zhXGwKWML9gxzPG+=i|JiJY%wGC-JmiB3Si3_#W5gTD}Y-k3)Jem}@qVmvg<&Dza9i zZ`G1y?qV0cCSw*SFr#E9H})n)-0XJW=Yp9##BE{`rhgGmK1BQr;iPQ;b*Y9rZ%SaA z1XSFnAB)-`jI{WTc7Xlo!%M)cfc83CoGDf}Zj&cHe#0)t2q|@H&FjrwoLr_|uU_@E z#D8*{2_XSo<Eq?!TwMU9pwZ7R2G4s8xD2i*AlBx=aT#yQ=7;+t?*XF~U<bRi`3>Hb zgX?K%u)lW%iuBjtx0uWB(Q)h@Jj`&Wbawdd_Uj+ogv9tVHWxy`6L#rK$(JYae^>{j zb)i`d^Ua*N<uD7@;~C0P7l`q|3F{wAUIQ=$MEy9lwE=L19Rxo@lLL2&KqR^nsS4#8 z8RNL+InDIxP6XKmQEMzPTRP-!{nXm<4?wW_un+>B?Z2c>gcrv+5tbnk%Wds^fEw}@ zx<6zpV6O%th6oh@qsVj?8WMup=H%n!<3$c*gam+SAW)6T+8~xalr{5Pn^chjNd<B4 zC`$>E$94?-fIji2r2Eu_eER%onOT)ShJH#sG&B@8=N*u;Ev*duRa-o}Y<=#C(0m0i zu8aFxsP0xS&2JDml}OO8?}c(w?e8%5BS?XzOs;QLf;g8^-=g1s^{??sj)QxnmL6~q zFWJ9LM|`LV{b$K+&(*c|mhL9<*bmJ+-`_ER=k(i1v&!gYajWDMJv|q~#Rt-fCpj$x z4KcH0us3Q#d|ecJ>-dMGpRL?Es>U_r=mf9_7K%wvJR6@ub}U4rdq0moPw(@B$L+H4 znhKl2S&sH-zE~qrcyM!JdsrKgAN{+dH~=8R#upCk*^{c%3`5{k$Jwseiqh}rm%sAp 
zt<e3VcrecL<)QiMv7nBjs`v;7Gce~BOj>e*olghcl8^6r_Z5iv&Avz7Tg>sCID$VS zEr|wdVj&lEuGdC)-8IGaE}X3w537)?1b8<EIxY~11NQ@d+cx;-c6w#*T7fCk?A_qf zMeq%l3@QUa3HVkU;LX42jkw+IN>JBJVRX62(;auG|6h<Hb9q;mUF*S`?|v6pN;_bV zdk<6-5kDX*rI)9K)@a*(H0lK42w~$f#AAI<=P&q9T$yHy>nqpp%fVD405u>j*fBwH zCo$uMk~zp2u<itw#<v;0U~X*S)I*xT``uobw0=$J_X{+;w>b>v;783~?ztkef-X~& z%`M=_Q<ZB%et+CiEI9kugzgo^+gSY*DTHv!a*q)5OZ)wWM#G2sf&@}3;9}*cKx^K* zKOx^6>t^~2@no+Wn)0R#O_d3ajKW+te>?T9=etI9KHp~N8Fd&*=bJhoE4Z}Uy8Nba z5O`XS(tfM2GuJf`cc}ntoeRJ}I!bmi-}8eOE)HHyJkCOSw53{J+z=r>%>97-v&fce zn`T9$v*N(;U*v-n7|T0$^}{{b8X)~KOx~GOv3M$*dLA0_C{J#z(KWPy+Y(zel@6B! zAT6p^_g$si>q!uKK@f^0a=_OPuOvLyD<G%y1zu`t5AV(IAHbz+3%*ELhqdRhRQb>s z{%X?pVXj}aG@Pjh|K0pI_cT}Ks#TFj3oW4yh^v&&Dn{7fTWA0NCdF!=MNo=#VJv2& zvWMdRb}RA+MuwZ&2cvm@148!j6`n6?Az1^o)K}O~ml9rBV%Gz9m!Bf10zXSimLG<E z6G~0(=V?FKN`Ajg$K$%fG^qv!o+iT|PQ(Cj%E=Oe6~Uk#8+FsMc4Yx-5LDmqNnXT% z0C*t~b8!Flfy}<U@BuKvcHw~@zuhTLH;D@M{wddp$Ea|E;|_tGGhPFE*zS0ehXv-i z|21&-n-bu<!tkIF=+S?HLZDA92LR9`d}W2Byuyn1k#^{ZTxxk@MQC#rZg!ahgRxn! z$^jYG8MySc7IS5~(irxyYXS+QA!oUlw#6eo;GX>9Bw5iBrHn=v_&XcB<;YzD-P!=` zs2{ld5Kc`W(Y*I@{KX8xLq(R5bHMMP??$XPQoXQ~dpVS%f)mx{vXiK|x!hr-|Ler* zNupPF$3+mD2|!Jtyw=>og9lgeFejZ3vL|@@0qiM@R%0kbxPNsuaZZ#&VCPK<11X>i z@Ly2R_9`z&>gc;G-pOQcc1thTxpU4O7ZFr;%P|$cm{c;vd)Lq@YnG(Z2I*b!U7w)W zvht?EBO@9^xz}p|848&KXAZib*Xd%Tkoo~r7J2jZ+ZAO(KD2KB%I`rKS8eqvGkTmb zvS}}kje3R5cL>Ad-IGcOmIv<qRlnq744dIy;o}UKk-iRt;X!kSqWHfNL<kIj%Sp6T zpf?V}fJ=DM?z;36^*h&3G`qtsq0jLEAKq6N&xg#8#E}A4jqFlJsA6=Q=lVoWnMv5j zi8yPW#gL-}h7^0c6kICuMZkJ`8WFPSk}x^i{h(^)wNWhd{p*bN$XpEAxYHG{y$4B! 
z&MxF#>5BXLVo7Z1&oAKD|I<DK>gL_$1{xrFZUaDTk8~LwT~)Z&v;ga(B7+hwZ7`Q$ z5S-S0{e)%zyKdX5Oyu&_;@aW_oZ*p&Z}rGLJ;mZpIr4kygHU`BTv^2g8{_hSNcCvW z6zB%qZ1CB+%WgosgYhF8;d&&vcu-&6I$>Nlh8)S$s(<ip{veAj(LW;Uf*h;E;OZ_I zueb|V>4DzCCR}`XCmJ}F0QD#yLqWtqk)w5Yk@|~%3Jcz^;SMm4(;I!Q?t(I7f@x7p zFP7#@x0{Ey2LVJfwx-VDUFq<4KDxl~wb3u8XYaWS&$b-nYXSDfB(N_qCQaStJGHwV zCUXD_7%X*u9ZQ}a^LMERL5P9#?!%Va?C}U6r*)th^z}v`45V4G6OL@||F#}Zfcwfh zXVY?)M({|tHsd5uQ|zE$4}s0P+T3lI1?pwgL8qMlnK<DqjwiFq#hN*sZEKyNm@Tf< zxv-38pI_PqA5^i0Q9*AI9Syz~g|iw`2S%WM!tNF`#aE|XjMY(h83J?<{h+({BQS#j z0Dw2&NqbukpY!s`*0o%?S&(UI{q~y(Nm->X>RbOn|BPV(Byzjqok8;^mb(RCkup9| zCg|KTcaw<l)Go4~Lh2T)Xg-J7Rb+<V`FPtJTXmY3daIidfG5%J?E8R$gDhz0Ym06) zS@H8d3m;j(V>Qpq(Qc<551w^EhmAaX@Zb&N+UG*L_hAww1B#dOk{1}TguyPxj29kh z&CDY?Gj_Yal|8-%wxaDV4p+Y(9v%j~@D9i6TUlczt_|A{ZfS+BciPMa1GWWP>d}Sf z18VN(@h&k#ZqCp7Lpoqh=`{b<{N6cAO#nW91cET&K~Xm3fcZE*%aUQOLp7`7J*obb zS#rKlzh`c>+!kX*v8}b;wRh%3VphO-`ow>H7!X3uwt`kv)|zw<MAdiqiST2Kl`}e_ z5>U2k0UN91qbOR~G=?<@YzP-=q$nx!NTzl8H%#-}D Dia)7I&|G83iikgf9pxFG z=q7V)`0aPozNi9jdmGX6d^Q>PM;BIesUy+Q<q*)@q%zPj)`jjluaURR!Pdgps3Ded z7vG%wCv4)c_bj<40hT5mO9L2(7D2_EDe`=i>U2|4bZ?s$=_pvw*V(fR5nez?m)ENg z#HQz9cBPESc^SkaFV6svp_KDemsM6~@)svUbse0=(}2z2%M8Kz{5@tO{5@;4`?0cZ z37~0qi#W_}6EQ#wnZs<mPxk%`3`2bKUtt)-X@zE^IjsgtPf7_*Iw7@?{;_xt#gDGn zhh!rz5O%T3w7R*&dD6k?>ZI@NjIsPib9Rq^0go6o(wH{5Uz&<wm>a19D8|%)F*oSJ zXO{fne%ZqPf_$byK`)9Qisv2sMdYW>%H?6Qco2{mUx7voF(ocT3&J;}<Z<)wY6k!q zlMVNaSKvq;&YKc$Y}i>W@-tY%cxkdaD1F-)2NoQM_f)y7ze~(;KiT}DS^KAb$JT8j zK&|s%_w2165V+W*-+HDBD0;KuZBn{h&+PJTn&38GV6MPIEKs8!_x05G9@CMm8uu1e zw1@n*kKX6<L#!U;9-fb3rUzzNr@4F|;2)vP4XQ~lQ@>1o@rJF<hjO^ZXASF#>uul7 zaq(o^2s=VT*}FGE@cw$iAR*L{pHj7n&2nCPsrrdO8DO}SjItttni&tw6llt0%LId= zS|hZ=D-mW6kIuK%dqGngz(h!=43VH0HMs-tIv%2IvGe@lD1TA--55&JTa}3vXbY;~ zz{v6poxTUR3NWR-)At~_jc`(b7Zk(Qr(w2k04qmeVl^zvn`w5G6Ac9&4a-Wvszq5R zvYNf~aKiA&ro?>o$IQH!K2Za-kGIG?2!uwyCM+N<+^v4V*g#%(w@_q4I^TCbBmU;- zn4C~|N`(8!h%eI%+n*Vj+z{vx7tfD9EoeEjQPzlXXL4a$GdjHj2~^_2p$8WI$B{I0 
zOl0KHe7@ZevgufFdAh!7Bd8HGq)z*w{&vzs#;VAvuU;e*D&!#N;<3G{^~)C3_pRQd zKP;>IV%kUZA-;JoCZPSPL--y=u+P|!%A~$AGePo-<Fgv*2aZW5eZa*n9uO?_2^Fos zpS{d=TI8T6;51u}57=9`5Wy?TX%;F;sLKO{+1Ic6FLWLs<eMIoGl5FsPR!~&`CwhO zkuTxCk)te2ud>GWX6|welspnl3>s#O1_+S8aA?P!v_N!!j&?nXR&Zl4U-?J<6ujx+ zj$SJ=ivmU)L}vingnY%*D=45sA!}C6Cysd*<?Nr$V8sbX_QIj;029ecHnEe}DOA;t zPtDE-B+JqFgdMsUVf%%>fl>ImALj#g;-iM2^=~A=uNznzcl`wZYR&%R(O1DB4PuDe z;<+0L3T_Y274kX=j0S*<*z*D3;B;FA$lieUCK@@7LYUkDViD3&Gkm1fqzxwBA>?>4 zTP`v?a<ZN8;k&H0>a5RtN9n60YsrltF_##q{mcZQdV%qEh^~SZw%RILKU>T5Gan(> zy?H{=K~uzT{5Ij$=5^Di$t5>8w*y7!!rw*l?(-#pG6hK#fr+Yj51q*k8+y#$k3FU3 zCkDc`$&MVUJFao`P+n%TGOIwT-2u;H<th!}^O6-K+L~KQeUYsUmU=Yp;MQH=|45tg z9Ah($`OSDEA}D%DaPG3?6h^Z`1mhbnEqS@q>;_ssP5$^hDr_+@Nif#jF&4&gsrc&W z=&&neH$ev1ubJdfMMkgk^;x-3T#U8RnK*lNU4WPlb05*DgAzEF3!(;Te+Y0jkB^m= z<lJU<2-zT#xrWF<DLnv+FHc4ieU9qLa(G&Te)V6#h1lzutDPyrT@5$JLf*_sUy@g> zhh7W0&xi)_G=|oL2*Wet)vH$m3$qm&;ikZc0t=dZzoqD*i?xA0+U*w+T^#Thh+Fv5 zX6zVq@2ywO?4$YjoQ30oP_^X`&O!v)!L(2TSs`#mpS7X4fYT%prv^nw^#$O*-*sxf zhRlAtCu7|Ye7`-x3=D{eIxnw+A=BiC1rGaYnNcA(>}pAO^Ro6O>dx{GphF1WgR)TI z+t!2q;p+`c{n?u^T>n_?F{3&J)Lcm3mjB8(P={J(-l)J3qZ)dGb%P|>rb1*FX*Xxf zWV1Iv5yg)~Uf5{^voQNgp1Zkt?uz^c$e?(NkxEW|Xtm6WJ^88r^j14Xl@Rg8o=}TN zl$Pb6j7u~5$5`~av~0JI@xv}H6wa9J0S#WtGiXro+h({!EU<Vjk}<HP4v9ae8o0!4 zXD7jZefx(?f`R(OoMEPB$<?MVN#>Zul{4pckYD%>#Crg<Qq6Q4cng%JzKl+zB3F>k z@m`b6P*JiXfj+c-YRLU-nOo>llEd2NpDrcCfb`h?<&|$CiqTuWR^AczOSwc}y2}gK z9zf-tRQGTLzO!eR_fsAUPL0RaUfy1I?at5gykP_i59}*uJBpgy{#%}(W^7DJ`Fp9C z&5FaS8WI>FHcgx0pvM+>fTfrLW-<U~?iKw){KbDdsbP1^2|#(SmR%ZnyFR22kQjLn z({MOx+)Ts5+gS!4j}UXDEd_1oeB0B7zSbbHf}*8#a7(RD?N_wyw2ANL|LCjh)yU?? 
z9&EBp@nq#Z*VIN*y2UiAGPyuOEDe##ei5%z(fTN)GZPd>gy(iu_F4~K$*_|rU*(T4 z(|Re8QS2up*%K8e-sO-{?wl1p;&Yg?eiU>6mNea>9CfOv3@i4RCauKIvgkIe#A*Vx zzfE%evdLz|^8|sY?!)Csuw8wJyfLgNg@RJcMfR#<*6r=)0X`QZ%W}&<s^;x+3Gi+| z-*Zxhu1!LQZ9)P;_PRq(p`TSW+WciC(=uY-a7oaYYOKC(pK*BFbDW8D-V(9lijbU) zrqB133H03N4n3}e+rB5s{D>Qnm`K?I@LOd<ei-q}Bc?;x^ukMhVI_P4Wi0vFZ&*TR zJQD1_dZ*g&Bz)#SYWG#J@tElfOHaAA2aNq?p7ms1x<a|PR>^TO^%f%q`=ngj>aW%G z;Wh6w;%8r9y2EmXL0fQvqG+x9fy@JM7h>C+@vE}ITj`krjyhxbu7BWF@AvtxBU4bF zlz_lkz@eYXs^~@^x9P8%*Su(&h@hPPBT@SqCs@LsZ&DEb%*>v*VI1k?&vm_->sHIx zrCIZcyWE{?KYuUr!4*XNqMfxYeO~ExbIctEZ7Rd1Z#<IW20%G!SrdQRl(kvn<GJrj znzM{%z6LvE4B5x4)A(QOr;(tL?(+Z>RRE=M6?8LHAX5={S&1J_p0o0kbtoj@p+H{n zprK9rg!xHdDP}}qaSIh`D!gN8CD}72!VLI;)BRgY$xz!%VP^afYCyre4*~M?Cz&Fz z>Y3wbp}p#(zAzt6=K!bZuNkQD(Iz@xv&bHDivLhAK?u<&iUvK1+y8)>nHd=Ia0m>Z zDgAsArUEUo8|UMTN{l<%Dgf6G>fJa|3s+acp^Ni1k;N9}%(rc=xXsi!--Nbh{}b;~ zdW`xE-m}{Vp51pI8=gp=M*28D5s_zpvhv6gZAs+qcUQ9K$>qkP5@O}a49gC^FV8O6 z<f6_4gn%?Iq(kVd(u7>!htzqfMrV2uWh1+IyoVNcgkdN6FDOzVf?mWo?Nf}f;ACy} zf$f-q{2u`&{muo5qW+&Ms@>Le9GGx?nMO;G9CpI<POn-woT-U{x{<?KNVkJYs_YEl zXl7;jQf47VbCpdaAn56#M{kL=;*RUsa4z4D(rNNNCJ(J3`$1H0F!ZzvOK)JIzy!Zn z`e$#31(|*^NHLIxj_BJl9NKoch#bhEUGSf@c<J(Oq`JDg4mtc>9?c)+=MZN=J5Qc& zL-}gR$t_OtaghTZOR<od(fzhKcMMmU6BZGW)1eQ&Gl6vtUf(^`b?^i6d+LwR1ZLvT zwNDp5)(&i6Tzt18x%My;WpjvMotQzOtbqxY6plzFrmn}?J%(N|r1PcNVs58-Nj~9` z1G+%rg$SmZx*)xXyXG@WzB;yEe`caKk7eywY+DODS?t=6cxD(Zr%^ZKTPByX5AoX_ z54dAA+ze%?Mg)8_N?$+aI5wwVB`b!Nt<>^rtDiT*4*Y}pUtJ#Yp}dJ%!wC_+W-W{V zdBn8e;DpTuWiK-CXE0itUXx)p;^B3H$zdkDd=Oo1fxYfkr++=}GcUa=kAkw7FZ4at z4VXov5{%CuGI)CBPV1O(D3_cP=wi-dc+?c8y(O9jiccbre|gf6of>Z^>$Drk%hcCM za#ID$$_j)91fT@VTU|NR_(Nu+5?O)eO2WDYw$%s<q}5<-jb&1;#pO~JmWOXep7d5B z@#kJI`$r=*TXtF8>1HqqQ8kg5wKHNd^)LS~to=I*K7;LVx~be&?=Zf$rB3Zuy7c)= z4K_1|t5=I^YV4EZUD^OFGJ^b=T;Ii_k3xiZ4K1IUq*ZjejQ)CB2MgSge53S9_3fbx zhi$yG2Djg^F}$@^QTW7~tenXw5+~_&{Ov1awHWA($zgHz8AbVsSRn+_V-g-l^Y`C1 z9bGJ$Ape@%T*qPBsDB;WiwcX0x^H;Kp(E-PWhpBulr~4D=4^Aq4juHnWFP!d62h_A 
z5H`fAzrj~(dU?z+dV;^VK8gM(0i9d9y}uK)S>>sGs_6MKNr#l^CaH^X@<D8Iy9?Ug zQ)5f(P*n>zWOh&bB?xm=SL(c*3tDUP*&O;5u0)KRbxaq!LYWXS+()uqN><5M<;3|a z*L^;5$Im*oIlKmti?@O$^X|t&p7-M$C<X#0CKf|54IG8nMT$Rr<s&sD`lKjuwTzFr zs`ejocUUg{U^QKl`~61Mt4fy!-<4c=3q%At-=VJA1(&4xO=np7?!{}W*G<V-sYxJi zA)8Yz&YmA<D2Nmnju{r{MK3#&d~3;?8w)ZhPNYYl{+x4qCZ1MfU2O<Ti3?hfcyBok zRB-OB*~m7Sj6Q-f73$yS`xEJi(jvC{Jxdo4ke+bG%EWm{gk5rRp1WUTYo!@-bqIw0 zAYCXigb`&P8`a}(4FD|ZMQM&$uKUEh&UFobjqm8aYrgYE?FBb8A$N8!i+AaYj2b&) zJl52k1AI)cTKk_asJql#^U?d;DIB7LXR->Kp>X(m40F8e^&z?a^U`Bz(B=NLRP5$Q z)r#aHzS50p;a%mEcj8Wz2Q`<W^sdvM4r$Kln|sOO<CqvqRl818_}NNB1S3uc1_`(T z7&`Nv(9^{iZkg^VF?@oZW+poluf1R>%J8>h8v@Kez*Y$qdqBbHUf^bfwb)uhvL_vO z4aJF(deh#h8beV@=GXa$*UchF4ytz~sa~${D*-<=&TSRPs5O?{#h0-#`}%;LGE3U@ zElHk|ITv#f_JuOgvwFQ@AM@!9)1~}z8;tQwL=A%?{N~j&wC|jA-qk4pY^p+@`1Tm} zF<e)W;!_*3q!O!XcDkx>*b*=Zwff{>ABRys^(Ke<!ht)&PKc?*CZbPHULLXXGB|`Z zjBh<(E-f$)_~ruzE2L8xGcOhNvZc{x2aOJNtsiQ$2M<4Q$2aS4{#W}jdm#!^Hay6X zk#R3I2NWv|RIkQ-MNHnXVGakxE(?A=AC86;>=1uO83$`wGmWv37?Y!8d2KeTWSKrD zLh2KiDQB&ZOnc|=_!YZfqKvb^%Dz6q1r~WwZEbBno0OhG-b0`aA&<X<)wzc8>#WO( zhi#^5KO{-E)k|GynBp}<-ayF%YjU=Rw<;_mekgu9`DcsJ91RP+C4%93KDTp36jA4j zN9fuQUrY-P)ggQcv{=~M(a07*a+IE*&tGy1`Id9dBc+&3=?N4FC3?;69UUWGWICZD zw=7S>Ccp+nMQErHVUGPWNkG^|QK*Njyw}QsTt*`^k~jzDr=Xf0wTL-VN4@wBc54Vr z0YyI4?b`s1E(@kGLvwRk;%i7<1qMHdi)ejB{&MQnDZPGHf*aB=<xYqwte-se->;a5 zN~o`|Z$l16WH0kaZn7;{A;4t=OCqHVFrg6yorFt!%}`tWUp0$KJb!3L;5iZNG!ext z9~&gx665gYtZp24-)VwdF&3poYjbD{{;D0QszAnlP}}fd`|cs!C~Ts`0-nF>=Gu@K z__m7>OC+0m{@8Td?$+Ih)J_`vd?5voKX8FatqkP&M_1;*`I&jnG}N^ZnaM60P;z(Q zoXEB01Z)4=vXu^CN4uPiV(=h^y*&*dm0kzQZMz$*cA{ibFKBk>4AwuEd8{)Ou=`gH z?(R@Dw^TV!L<oGDzlP8O=2ECc{0f@YAC%JPt@g=O-SeoNjB^3$u`CzUW%un_T~daf zncSVPDA5sldzEA*MK)Sef=IH7#;A|NjL4ZACS0<_JK>SP5S$dpmofO~S6@CHMSP}$ zPCqfTIibS4+k?B?U??{z79#L07Jy2>OG|jm`T^~Uln9*-#N@or8F8X7fa8kTB$Vm$ zL+!F5)nAa~<d?^4d$0aS8G!$1+LHhGto~=&#Q*ns{O|LC!E0~iqU3-W-wtYAp8Q!< zY+;rQ`r_e1eyADAv5?;l6U4dVy+k*B;J@tL{_7KaIPSfX(o;PUDF=ld|2t^@?>+ec 
zp&hL4mN-<^*r=>-Pr6(c7H~@RmS&&Ty=TgL_ZTJh(>3W2n3<Ogl>@p>TW`K^OqaY< z9Tx5+VS902i#NbfVJ;AL(Na~m;kdFm_qqL=58hnOWH0{yG}^toD+ObB{4m+k!~@@Y z1*P(f)Rp`X$0Uawdbm=P_mU!Qw=pU%jt?iapr+W_RVgH7)cot0tjB~)Gd^~xzWygf z<bTqud{zq{0s@75ILlSQuZ&DWZO07<k5z}c;<PP1a%OvVunMfK!rRk#v6eDKS{06M z)~7KIlsK+qZQ7c}=6SdvlBw0rv+ipNqr6qy*s3Nc_e-E>=fxb^TGriN-dw2K9e#G! zvafL3m(Fr+j9J*E{Ek7b(?GxO)@R+VropYiQQjDCi|Ck`0$eP~kVRYnbVu5Fi^qC| zhx*zMomb79Mf5%b!x|{LQ4@ne4?Jma_{%W-8O4VG=JXH{{KKof#|H?`r46nH4ni1A z@u>xPB3kQ7R21qK<OFXX>|$+&59<h=-(AO1)v|FP<SN(_&fQwtU2PeeG_%^m?b=Lw z6l2eHe$n05-CZRWUK1V@hRfJ#abGDof%W1yDZzRyt9fv>l_T&P>Y?YXIcx2^i}B`w zz*0zFY>dxjz_IV9801|`k9tPFmAm_W>w5C0)y}BZc8y0|3G&UKZ5PrRa9dkjb0ve8 zc|z%-!t0^JQNOgWBIVVP<K3U2ly@%>-KuQ8^EZdnMm2Dkf=u6(i;Fk}de49GjuqJU zU3<9MY+<kYbfe61p1#()*u1LQUQ|JJc6RpVC>tBweal1XaGDQH$%S3@essH#pA~vo z&d~MFaaJgxWZ?V<>LYg?qBJQs6U)tc4sf9?N4L}*=A3>Byy~?iHl3;wcqQmKdGu%( zA&~}%)3uPJ5J$;N{wFDi9zlFWhA8I>J}f_c@A}$e#T=8$VVr?wO-xexRff%#!E&qJ z%3{qY&d$p?V0REM7ruLU_2dz_TUBBge?wFr(Ej;-|5vY+^WicA<uobwE~@ogVNssR ztn;@L{9S6ye&*JuT}=1%6u<lTJhPnl!<pqP-;m1+Zz{Lyt=JqXbAXGsD|r8Y!1%_g zK;JpvYU@8|vDutL96Slb&k2WC&p5Z*pMN{r$$FRfWx-HWucahctM<f+3cfb&Tf-}J zRSuhiccLTxp2hP`@ivFg^}xNGF?{%NxHCf;u3VQ9c>vPH3VV<3)M1=YeeZkQm^r35 zaek9m`AA7gi(@=>^+5fMKmPe{%v7&#t7d+7zJdR@H;F*}IDX<z+suq9<$+?ly~W(s z@Tkn>CAX@F;8)Dxd68H*rrL+29f(-i>}@*Zyf7g|c_(;b^M|`JlJjC6IF^^UzLO^U zy4fjCEzY}ntCeLYy}GkbNpAV`u{*(Y_Q;Usnejb9b$K82BTjaf+j)jplFdr6UNsM{ zp7eegS$ew!dz|i<v(WNt&g}C+o*20~rY1w>0qM=q(xf%HF~}|)`S=cJX~d)1?^1#d zT#)|GusQKmazn6X7PKJp62-TPpPnOrDxY><$8Kbq_D*nrzRJYpWJGxQk+W;4tnBPw z%YDz8b-2*-%*0PEOIUThMm~=?omsehvTG!J^#-?vkn1n+IbRjgM;+Z?QnP0t4%&;p zeKLQ&<jl$)2Yka_%d5>P+wjeu?|W3vke;B)z3|>(X+B_bAP+~W5?LqOzK`UdxZzX( z=&~UBV|fuP4VEPWH#jtPbh<6uXP2*Aj$<rydt?OPyHv)_Vsd6r6_>#n=`6JV+>|@U ztEO6?;(A;t&ykM`hQUwQ(i+bRwkTUEap%kyV=a2@WE@8n%GJ8&?7N!Ui?8uL(&HB^ zO11qz?7d}JRZ-VAiXDW~DIhIKOP6$)ly15kq#G0@Hr?IbNXI5rx@!ZnDd`3QDXB9z zKJWXU?|j$woqy-o*?&M>u;!X;&N1#W?t6@R-fK}mv&2ES@Y9!kq}2}j;y$sBKaC!1 
zC@;}&H+X8|rNE0R)$4;Hzd_PH{qtv}P@ujUL1klk$=uZ0hfqBH`~xrDvJ^1HA+zeB zm+%xlszH<S-?0f&e|$<N=!aS6lc+QNI}+R~recU))A1EAKZ~L6&HD14^0)wtP^`cD zO{d!qkH4w^@vZfKyI41D+hTjeRX?m!0A2EOTd)pawwm@%+YQLXsBxqi1m+2c=-xzW zpkq#SyHH@hwq1)(MV1_C0-rx7*N!uc=Cr&^VC`HPHkY1}kznj-X=zDwKuxmDOdYV7 zJ8Cwvizlv3rNcdF!oKUD-<qvirv6e<z+(S-<2SmE@~D6NXG~76=Ok^WZhzJUhFe&` ze6wg>mx7|VWa83biJ^=Got4SerlfZCRToGwG)Ce1Iz%-#lrZvSl|&k1BK#cZry-hq ztt<_E`ndSe{HgCJf5F>LvP_;!{C#Vyn0?+=V%Jxw``tZ*uVvDNY5NxzmFbH=-^UST z^KF$W1?A;Y&&VfgTg=8^iKr+(2z`FrrYh7gawxV?hS;HtmGR`MuO9r>y7+UJ8f5p7 ziH}xRB$Q_o(G@Y+tY-_^vv$Otx5Cm-vyV4JaiT|_K>TA1huEnoKV|7h*3H;qk+l<2 zEANcmF;M<>*X6OOg^kTS>0+Ch98SF3k$#S}38D*rDhFOUa4Qhjv~z3m&%-0o;g!@= zkv%rv&xMbBGQ0LVrjb`8Z&DQ$M*%LsUzRxZF(~!*?<5n#v`B5ml?!hmtE+{e`|n@m zU|fY{OYO+)iS0PUF~39F;WS}247+PQCoIa@F!)+6gfT`ifeTI=(!|xS-AfEQiSW_~ zQ-Dq)J0~3(E=Lck1S!rxLdxP7z3;Q`QnI`Zj7Wdqe4Kv#)%HJ*!!d!Usejh}C;pr< za&s+rlU{6<3v@>am>~c8XrKS^SDGYKx<@x~L@3H=xjJ{k-1CxL2?fWD;bPE7g#ul( z_;5p=lx}98Zbf36b(tA-I%aFesk^7MHuV$I6~IKm=1AQc8jZwGOdy2XFULr|al}hX zN|yGL2lLHRT+I$|^a+yT2fhAgu(|TbtW%Sd9kz%5xm$g8zNMmXJ~)V4&029zjJ6(r zVE0tQ&US)1G=5@0+8pqwAuhe$xC>cY1FFwq5KYbgJ?CQ%iiiQwUPC2?>LgQeA1Fm; zc1@ajvB;uMKKhaaqj|3LgNB)!@qR>h&r;>*L(Pc4+xkA=cbQ!s>So^Ep|Iuslez1x zdLQg7u%L$;$8CX|7;c;bkIzMs4qJX^U7sT{RvwWW5r}diEBng3)}U(m45RZnVbR#O zOS?x_Bt)ZhYt^C|?7%lEgR85??Q3~Xt&6p85JQqxil*P+OyyL^s8h5=Xw7QPMaOL# zpNoZ8H9TqIHckS#OP}Dxy~P<eXH);k?=vs?)KT-!M+jUKI@P7eVx~K}eQd-GP3>&f zCoE8A3=6ibg;1;$p*;kfd1@OfnJZ{4N(Nu)v3b_6WSx~fZ!t%WI*6sX+PYtGEdBgG zp=7>iSmAIhH;P=oJ@znc!{Xrm%2D}V7B~Vd<L{0njt@7Fy4f8Xqwi^&JYP>8n4Y9X z#ohDJz}TT4c_Zc$Cgzy1tDYI~A5*jM@#zaj=fz>`Ol$Lg9pg*|fHQy1>SsjPqwPrw zrnZx;Od5tmEhl!inb!~V%3sqFYT>m>YHE~P$o!%UQ)MpnBWE#Bch+Wh&eKlhONvm{ z+Un~OaZacUj2a7gMV>3mviW5QI+2f_W~&oB%8@dd1x?|6KTO5cD=G2vKpogp6A8lQ zN7XRMRoEhf<lfhr-7dvP_9-<f1)yO_yxrITY8aC88KGUiJv2Preeu09RmvHr?KU*V z+0V^pEYuw#T~@(H8nny}UC1_)teK9G%JoKM*h@}OzrvTaCgJ^h`kXbG6#x%SWkSb< z#s{|H0I69mfJ3dXJ?fE$4(t)6q0_x?pRuaXwyBYwV&&?ef3G!lUzogH4ZFHop&-UC 
z=@ljIZt#TD9NUWH0Vbp>s(hP?nYqM=2*F=_S3z+8%sVw!grs&}3FaBDTOLneK$$x( z|1t}^W|gw48pY1F^I3PpW+`(~oqCFaq7uV3TbEE@f(k?<qN%3~m`2wnqX|2HU0GCg zuZ3YI=Lv72tKR-GEM8#`&<nrcY3|Iva;X$QKp!?2&j;s}EfJ1+^cG@ndSWT5kdjz= z+bZM%1=t@04~0jzk2^AqR86lt6PCFlElv8Lka^|I;I54JInRv4zx~p~hhyc;aGY1( zN%5{~9ILXp*7lyeqn%{7U36S!^DL|mO6DyuB`Ih)U)u$)ez+GIxn~H>55E$PfS7{p zEL#=I5jAD%7>Op0d|YM7c>)&Au#^FR?uj$RjLtjb>aPZ6){ZUDQ(729j%_WM-&y?~ z1lL2q@8ov2G@xNbTfRWqRt<Eznzh)2C231Ru9f`eRd!QNccO4e03ki{&;UJ8NZ1dl z^GxSFAxBbLQwT1<FP&SjUC9~q^3hI7LKqFc3qoycB%98yO|(Agu1He^l<YKy!QD}N ziUl=t`6(?Nive!Mw>+C0FOwwaouvNFxa$3vvzL6FL_~{<;Tz>U?RvRy=yO=cFZE5R z;>;CGc$IiP))5Y@e>xMD6S%N;7rWbBmCg4(ONnmjlk#b+<Whp9#8PS<2Xyqf{14x; zZJjUOS*}w$-l6UOW$>ORXrr4f|FIOt+RGrzCJ_u~Y^i;ThL?Mq3N@0MKW6F(UNd8T z@Q#&#w4;dm(~g!Xj_7J(AQ9Aal#|VzHmay4bX9+3)3HjkaNdHKcC}{uK2`>^lWYv# z$J~P<akb+9cFlR)C24XGC>GN7B&J4xkx`H#Oj|tmxP;hm;Hra#!IPF4<+;(Kx>3A~ zwBYztPQd;+FETQX(y|I^!{&4}HDgZWTxWyu)7=QQtEW?k4iZLRw(dVeFE04tW+~!~ zkird#No*sJ;=kcckq?>hj6V4$Ff6Mj<-tN=#zk@*W@K(IEs&l=m#k0(shCn3zPCj6 zwWfxOw80hUpu)>S9rVBjD-ErKyE#-ibE3AgGD~NPkZE?xrk<Vj`tY!7QUkZ?9c8X; zrgjEj3^>#ijl!Fm;&CE`;0;F9VDG9)6&I6QbXmaZD*t$|oP)V}5gH>-wRiC=wdeN@ zro%cWAk;C))*?|AY2Rt-)5pypm7R4-q@iVFRjB-|B|XuN_?zLd{{GiQ$6^jGNp}_u zZB&~QbFTAl1>G!g0;x0H``kmVSEmRY)UCDi{R~Hv^#JQBYb3_1`{t`tL96i^H$i(# z7rnad31Pu~Q)_GamLH6{$&#;K7^VVuonc$rV#=y@$uDu+wPWP1m30_Dj=n=Vkg?=U zY4n3`8tJ$0*FzY2%n^Dli5ou}u+^fcDaK>ht+FQxWQCu*rcar@3Eq-`sWkUe8%$IK z(2m}&6xufV^j=aVhUg3LvyQ$|><US*8T6unxExyb@bs^9!F`!9WZcw*ifs>K!JKaI zkIu{JGw8zT0zQ{3N8O4Dn+H|^$QsZTg7Gs8crHW?_rEb-?!Z7;hsdz3(EIv*f1A%_ z@PlIbNanU#i7ZF$f0~svkFn95MWh_Qm$HlPcP*dVj>{UdR9z(;$jDEnjdF&iabEjm zhm_03a5wfJ!MHm^T*R#PZDWd_e|k`yDpC0hOzv&yt1klF$y}?$%;3=sGH9(a=HqZY zBj89c8TqKd0C!bj_(4Z3()4<~ce6%SHeRL|k{i|MutYe$@>F*s$1~M+$o(LhSEe%c za~eNJfBYoWoCN!*i2QMt>{g!&ZsIdz$DYNJ&n*QyZoZh#0SYaL9Yq@H<kGBSAW;6j zCUzTOn1P^M$o<^BtH}V1)O+n$`!ESrfgg5BI%@mZ;99nTuIBrb#p`0u$^mZk<Il&? 
zS<_M#G-cj(SQu1Jkw6<Je?1X@Y2U<T7aLJs@^00@dHaj}&omoyWeL@U1`|7*yyQP& zw@u<v#&a=@v461SYLSQSUu^xFJg5!<j*i&oDNh<_rjxZbV=#ps)v6SAO-q=r2erce zEnE1L<D(a#VF^)J(1D3ukiK!o{)M^)%Am^Bt7qJ}%4v8EkAyQ%qT?mk8$#?L@5yUz zyOTI>ce$LHa_(S(dh{2Y)X^N8`azeuvb>}?sSOelUeTDL%?{tbts!GV?eK)uIxf@Z zuM(+`$9&snD<o=LRq>vgXliSxDXA4t_HMbS5l&7{=BwXJ%z+sV@jU@pqV}bravDMw z!Os2Xb78^q;XrlfiM86$G19mc%W5T-w7pU+d;5866N>M}Qx2PGqR`K6yk?}xW#`yT zRp2XiNEm$rvB*?o4>NLdQtN+;N@!CgP?O~+C;pnPaWov!sJFq+pmU@`d|ol9Trl42 zEz(KonO&sbr9z|L9Dk!KdoyOD_eMFovK#@Ue1c3ne3A}Hh>l&CF-m3Ni1}m-=PB}B zn8IrEJXTbHJ{G2z%>yi0xJF)>1A+O3T>AQI&t^wXe^9plltq~89<4zX)LS_-O}&FX zWzukoLumB}QyGkZSXNdxhYl+)F=psFdw5}}sSE?rpy2CO7n;@cWT`ach@xBaTMiLR z%SNQO6bHE!VXh=sfSG~X^#NJ`Y&!ZIu_MCAn-c7!sR|*H-Obli#Yw3H?`aj}Q1I^y z^QTYga2`zQ;E0DwP+Hd?uAe<esF&o`vIJAGP8oV$7<7uZlRp-VD6&5B(haxDIK0by z5V)}ESVI~!$B~9M`VsUm`8F9|xRcg1Uea4F<UiWI<^Brf*r*y`(i;ivkO{gQ2<*&& z>;{RTyYC*G<bUJ0p)~Jh5K@Pec?=+vOT#`lD@O}LZU?h`b;R$u0?0OLYe;jlvN~DS zLHm<c*rnWSRhe0}E<HBuL-&{x5!kQ^C-9oAJXOnCz_x}-7+z}l<NXcwxgs0hT&h%{ zND4{1?{5*7-t$##BBd$pXC}yFK;;aWCN~|4!X$1(n=py?BYw<r9L@CPGow!I_uY29 zY2oW`Vg29>w~#{H7y4@}UYUp|Rj}m2wGp&Yma#r=A7d(bRUo!zwqRsvTF*j)zMv|m zK!|DE%x-)^9m$0K*&;TBq+&BEW)8sz9(RM)WmXx-HQ$?_>N4$94o>U1n_B0&b)m!3 zGKqM}W>}<&z|OdKVkGG5KqheFS50htn@l=irp*xY9&=_e@y849q~&I~cU(OA)mwzk zlY7nh`9VnQne}o)JXAx?ywedHp&Z&o%xj|FvJD-Y7i|6TZ1Yfn3tB=Qs#*{)VbUcY zCk!=`B-W$zV^OA|fli=^;ZWzdE6PbMEhdLniUr_Cuyj0B*bz@m41r1%E6Uf;$cLl` zGp5T}M$qV*<W9`4nYkG#;3rhZ3NO;;)YyGLyr2C3PJuwAs{$=5Pnv#9nPB!4FS6`- zhu{;r#vrUu%iN<$sNiYc24Q@U+VGp5C9YI{Ba()`?}JCz*b(>}!CxZr8Q}x2vhNQa z47P}qqJ`3YCY%zwSc~b-XkFVPUh#49+9Xya^+7npl&BN_?x%Tf(bikddmR>KthAP0 z?nktFzv=dtp*z^p)L9NfjfpBB!LB%7nlcDT$@`$K9XG*ZelJ3zn3}CwOx{wzSwzYb z>fByqIaAX=@9GrWAje*$6jj0;+?a)xF&G^?#qrSZ^DgJl#fxhM0&@9#%NcfbbVQi- z2_(BHQv>&2raZ2G%KSpcquFlI#KFzYZKH+OJ1KKseY-Gpg$^dm-bCTuUendB$v0+$ zZkq^<7}m2o5<5l2{FfSUg=RH_dsqqRV<$;JkCwJk-4f9SBA4W`o)k5GwCEzmB2)fs zStg-OHJ`Ua$tWtUJFVq3p+@HU`}5hkO2*%nw(GgrNxrU9=#bX$WS4#)PNBA()59DD 
z^Na3H(sT8T>k3?tDKOjSNtw?dDmL-i$XI4g!tIEjr!{`v5{mm7kiB=e=j;BRn+sZ< z=$p&w3tLUAf!Es^UPq67+$ngSc&+cb*sZGwd3z?y6D@=f`uu+8bAx(NJ@t3z^ZvYT zEzhw_(an5FMB8k{Jl4(Z(YF`%bH|;Y^<gOvww@b-kDi(tV}iJR9i(-8d|LmE$%5wH z9w+ik8gG+X&_GQz7BklJ)MG*qm5MerE^Ij_W%fE{;v_qsSLK^4+NL`!jDJQ*W1BNW z7B)vLOR^M$n?ueX=plAAw@%DwjC7T4rmI$W>wS*ZxuN{lYa&fqm?=hBaz_0w(hC-~ zaD0hHG01wyp=}g(1P*iYi|m&42bf{h*RRU-E#Ai1F~k(s%-Imz;N7%5HmixWgRgh~ zu31_Mj$bOWl;BOZi^l43`}UFDZNI<2pO~2V;I~U|YjnRBfgoOYsbTjMI9y7DKi<yZ z)SWW79jD`nnJ^OUU^O41tOgkvei?c+*wtx%3*(%`)zS#V1Q((RjOE=sPlcs49$r=3 z&L8;mtqSR$ZL&LX`E=Q(!dm~JGuFlU6yFzVD&jeZE2du$p(2<EMVn-=PEUM{#jlTk zIxtKPYj1N8k@;K|;?495O~+B@I*c*1uEasQ^77m{OtC<eeXo5G?vt<9+XGYZc)S%C zrJv7AfEJ)b%)7}!M#w=OZl@M$_;D`5jz_45A>hQ60?<Tn-7v@WpN`!va&_g(ie#!x zWFifw#v2bCr?L)&q>=Ny$Oj23UdF5$pWibs`!kd`A2w6NBW=c*ju$*me+(}lr0Nlj znLkN$F0o&8JQv~BHzN$zh8!&^CqOJIBwkG?%kWihb=swpXczDHeH4PIClag|`7;NM zF5-wZF`zC`0*Uv_Y<B!H3HPphMR~*4DXxC`)THNw3(ewJ>tpM6<^t7QLY)m7nI<(~ zhU`)o7Mx@L4!@XpX3V|#o{?TS+n|A?AUpYVc6cqtTk&c=ETcsSbyLUla`iCOV=!yZ zCV@U~9L@SRuNuGj66a^MC(K^k#q8~ep#qOd{BYo#^J?u*47zmNfAy}dy98y>vc60c zpMExLmwM)huEW><4(sjU<vB5a6_--U)|U;qd3<!D9mR(ku-?1VHF-{E`%;05S-Zt! 
zaebP&!}8$osOIr|nY&;8k?JPZXxMW8e18OnAivPrZ_hNWri}srvp>!UeZquFG6$M> z3d^sjxguVim!I^XDXF{I6`Z7;m9eOlXTVM-Ty&dv%b9hAz_D-+Q-Xim@15HtSHzO2 z@v`WLWj;k>J=Aaakz9sK73s-Td+V;cYM1*@p>v;Bkl(~ZW6Z(E3Os!$XBvc0NwYG$ z$<^Vryl-JfnD(`OhV+KI)3MeOxK#Y%mYC%LF7K1SB=tlPiG)gr;7xi{pc?h^nsa9O z#f_|Gmm6FRRD3<}J`5rcE3RB3JlmG_FfQh48*w-DYWS0CLd$RqfgNUW0Z5qNx~bW& zbC3-`+HfIFK?qoYYfOJQs92)pul?hAqC9!#R?NVUZLbJz7u2eNl&t~zF@wGQ2=E`k zOYc5-2>765RMIFBFW<LDSYW-y`1ciMpt1>27b^B!abPT2G7rhpZFifPt(`PFOpCCQ zE9v^a{`VAOPuF>829^)NQ$W&^2eoG#+h@E_TLr-OyZ)u92wt;Y+kfbyn4b>GXRn<^ zZgx`4eMFLD6tBe66s?C^4!VYQd}g$Do8dnDo!z>4LYh|dCHt*r2lL~|TY()Qdx6~t zm=}E(f}b`Iyl<P=#L7Bzu5;|;3BCDejl-AEBKTA4Onhf)JnC0<NhbN-XShA0A~C8< zeNG1XT0a(?^l!j7!LlNpDLQJ<b~r|#=5xe%5j<Y*#&#)jTPzq3(vk~#T-z4j&hLJ4 z#KrV{ED4(-+oEkp()Ku~-pHyzq1Cy6ZEl(`^`F_raw!3eGWc<IO_0)srF5dC?Y_>L zTM-09^#PJSds|z(e*+=lC)C|l9rrjN-HWo7FOc?U*OI@RXa|Ws`JtF`4qU;_&Q<*> z*`>25u4<z^B4f5QO~~VBx4`?dVVN6PEt8FSHXNA80;!YXZpAVS_+`q`>>9)2vgW^g zAr-9`OX-jHq%`r5f!#9qNd^N0bR5~=#XH@0U}ZalU+!%JOS_oZ(ueU$`veur%B`BN zFAw-Hzh}eG1o<-(VgBD^mMW^mhU`dOy5*rIX%>i6A8WIC)3n}ucOs<*zn+j?)+Hwk zASa5lt}#U@=7iSTgsakc>B{>k;-t%k!6(VN#0uJ9XMhF&Wj=h*rLbsPRER&4pi*1N z#Tt=Ov-z_l9(oG9Acif$;@ae36Yv#HY`kxYNLKFc3#xKwB8}0<R;d$Ip9n{VE=)Wl zi|8p|hZI=ZXCQ3B&7D1~JMMBF^mpt!7*0?A(B5jpCo?MUd)&YDDrFYzV}WP0m<z8f z>E$ok>@L{!rq%VvZuveLs><16&pYVtU$F{XI=(50nH&^a?yU_|;f;5W*CP$wq8B)R zvPGnG?&72)#i-TF(twXu)j*GyqS{r^>1wtX+Ua=k$NEUV&D2$QNoS{A>E*2R6#jC= zkmvMdM<ALW-R%~#|6sF#dt+=l<+UZ0!;YU1C(5K)`i$VvSjdd~K~1A8V!*1+?<Jdp z%cSFgpT?W!tQ&xp;y9Ji8s*hSAkD7?Gd{8@!(n1A$HEHPT-ww`sM;JP!@U2fBfR8w zmvSzaE6CJt2z?%-bRkb`_!bKcgd(8G+9ef#0#FZlGY)355<>9q^nd4`^#7`5XDbL9 z(t3ZKAp_eLn^C{Ft>0E@ZfQ8KK7;=K%`EVV8T{8Ze^Y82=g07zN8Wv{xl=&V%=bTJ zY7pNG2OXYDh(B`2m?w=N54WEC3CVja>YGp5?cDVB^%i(%U(;Co*42Q{EgSUKv4wc^ zP(FEB#s3g;(&kPF9Y639%on4YYz44?J6i!co`Ab$6%||n|K|Y4`aebpOQwHQtZ9W~ zUqOy#|2*S*TNUysRvCdZY$(}UqMiX2r^2d#lVr;Tu##Oh@@HLchT%?eWA;r*{x`YA zwqY67g$CmjqccUbyDV|s`5zY@CE2bH3M{B*@&MOV#JtSx&gOn|W|!M4;WmBVsv`yK 
z>`h!;$DOZvX)4d?gpltse`UnaLlNW!)@ON;$rC_+vo1%)0Kf7*a2+PunkTP3C2&~A z;#%dgqIr<wX?)5!ZlUeZ=;{D+3QK@R1_t?QyfPonyRs}w(ZC@t+O6fi`FdK$p_%%d zd9s$vKVHvf`L5q<n1eNV`ET3T)wN$eYP$lqK-%gF_E=Is-5HiLd}J|IbUg+8*ba&F zvg~wD-cJD%`s_FRqk5}0CYXRf%zF=^vEXF~_L`aXe;lN=r;?nM1|y)I3l!3Lowj0; zyf+Dwg#oEh_1CT)4wUbIuMHi45FauJT>wt5bVF|DE$3g{P#bEa9~hLDfPth{e%pSb z+u(5X-PTmK-Z<11*M}E79b5VXB-I7j^cfc$2BXrFwLLEnEKToj{Jqc{5PDNi0}oAz z+Pnu=2Czmwg*N>J2lU{Cp$u)T19eC$t#i7-<b+-7X17D}%-&xE16r_#WG)jzoyt=E z{U6N8e`8h#=enRMu|i8Es3s@xu!9jj7AF2nl<+>0#`i1m+c_!);(y`=KBx5@c9CuK zsv2068){3&RWg0aME44lEv3QUoXETOxY}UEY%x~UqpfOw7v}VWzH%V4viuwOtudfC zS>EY9W5!0^`}E-m9zixdv<dgeUmA<%XcY6CCAUU)f{}+Ty{lGv{AhbUi;T1fTrh!k zdqKJHe=bmyS}Lxc=p+1G^%B{ZtuPTpROW42p%{_re-D-qI`LFTi^d6$#`E%Q5pNvK zDRpXJ+h#8rdc4Z%-vxg%s3J0ROVZXD^kR;0H6I_y3v->QYAz0XhMgaZL4b`!7;oc% z8!NfWKR<sFJ9>w{lFmXxIxT1n%lI|f!JrVLWM15WLLKyZneG^HDc%gNEMu*VHvjKS zDuIcG!=yqhs}e%#Xj!4@#rEw^pRtfi1(d>2fQVcP=j<<-7{c!KS%F2!4@|m)YJA=x z*E-W4{5rNSXbY#S>14Q-kU2+%U2Mexn80UOzu$)2f2}!RTbm`W70N5~gp^g5gLtqL z7zb{ww}Q90rT4!G{5Q3p_lf~|fAFUGA;1U_*q)^o0`aqE!e?c=gPNSJ1K$bVk9O__ z7xYDW;METzi0-z&E?|=aGW7sr=%P&lzAn8Zs{%QXFqzPkrBO{t>giugWpz;?gGr^0 z5b_$F;L-x#8yTJ+BsXL&g*3JP9yfgt=Lke%wwS>=V4|@DTTgQ8*_`7Db~|(GYq)LA z&=HI&E(=v|r&l-2Hx%C70R6KCEGR%cm8XC={aR=#{=b{0qJI(Jqlz@ln;nwN^r_|< z{sPP0CdG>|sf}PuawTi%XbQn!-Z5`1*%d6w=(Uu4d*#etqLbgn3lyGdJ8*hC>~c{# zE=z^ZcZ^I$oy5GSNFZqGT;@NjEEn>*VD5aT=(FyBrTd{Wdc4z69*Z}V?%!<s^3~hx z?f=IP0pjgud};7tftJZ?i#uFjz4qiU@p6rYT*<Y{fo@mG@)DS6qLjKm$6>ovfUMC3 z`I*U%`#!fYJUU@=CDE2Z<~ASDP_XU>50fRlDDS*iQDeTgWH&!FyxXw<*@NAgDc)zo zn=p;n`kf*&1Auf~^RwEs=hdeRG+D$dOMPVr=gU=r-@8<bYc?23*ONk>t|D&ce28z1 zo3pL60YF2#FpfziFar2PdZXCJ82RZ$;@XL)o7r@L#10_7XaxZyQ36n_&=Ki4>)4Nb zDjl{4%=?4RIr$Z&rfmprN4@=*1~`U+7&}+{F(OTNHO_MdYg#mD^@<@FN4xUNAZdfQ z0^mwZ0-2f*Wq3IWUxqO1JjtAzp*gahO`be0t@$e*u=NfccQ$XXI)Dg)?c{RZR;#CV z`wbGI60iORw$^Q{J7Rr?SoLb7?2PTP!fSci6a|H`cg@I$Gfyh5|9x{gal_}BB4&Fv zo#)8~PX(zD2#xDbV_cxOYi49xd$zy7zi1%F3tq<_XVG8WxKB4d7f$(Y-Q1>)`XzEk 
zHyz=+4~2iZ={Wi2nhVllHaYR8kv|SbV0~f)VQ%_*l27}cSed0%N5Z_1gTwnH-q;{R zgNBLA{{2Bk8q_wXWmpV5Y(@2&a@fM*$SJ~8mNE0*bC;bSmt~$byfAb|f$?|Ep=v6i z(b&SqR944*=Pen`i)Y>`cbg0ZuF~+Ac;;aE-sRU0W-W>FOUB5iHwlJmPaVLfr4C<z z@v4yzEPk8mW=-u^2Ec<Ty?rDn@<61CE|@5+3xNQ%!$)l{55p)zz_fnX13chEKVgD^ zt%vFBYD@(wLw;EP!4yviFy;p8Nvc^P)0z7~A5`%HuVOHF%fddJ4$ItQPhiEPRci{@ zG<3^b_VYAOTH`OFL#w3>->IS8D`^I%m>DikYw)z-P0D7-7JihTE$BJM+Be%h=7R5D zNv6wdE{kXqWh33Jb<HdbuY)kZCr7%ax3?q#KX8MdAmc(kkBP!jiB;X)Td1IkI$;#t z1b``JU_*YG6E2*anX;d_DGU!4n6hc1i+sG{#t=Sa8UR3R>li-YCmbg3fBBd%52hme zz^{LUgJ0iXFOL(|sxC-US>}&n<E5qU?{7^224$?P9axg<vu#QC{2C;Vd3Phgn4aQb z%MSulWR&vLWWZN~hUu+@Q{L`FG)(cRbvK~9p4vvAYnZTjAe?Fann_7uIHnDVe+Y<9 z;=13E!N<nN4i68vwzft#20i;pIz*jG3P8COOuO4ojELaAaKZSK_OBv}y5>-g__Gg= zhX_RS#>v@10IRt2o3#+*fptLT0-oQZmVbB|_lpcc(~imduT$PfZBv(*;jZ-f4;5(? z4rsU(7<{)$F6R;3wmmOQR&2tT?^mP&iR3uXSqnfk)tU^>V*-banB^b^K;1eB{K9cY z_7zx<&!O{|gPMgY_w0id((o+V%-Xng4Hr{<YBUC}<2fhQo@39y;5(e#d9An#+(v$$ z1HOwz85X7HTxVJ~@BV51oY>qf&Yzrm5TILd(k3#8eg$qxY!!0y#=+nRtf1}4_dFUC z>SyiZW2kYN0CI}fRiX!aox=x@=*C)jP9-ogY+5%@3sXe1zw7<t4&T2(^a!ntKYsyH zIBbIFMja+@IKy_G`F1P9msROLeLc>E8)WRrx*svh_n7iue}qS7iwx`W^b)+ej_nnq zzQ)aw30;>E*v32agyLI$9^O&Y&_<r-5|klyT%>rty<m+~rK#tceODor)7sm++y0{b za77EGMu|<CgMXJ>S>JneLHfJRXNmfJIHTpJuCpt#0VGJp9@1KD41HNQ!dqk7n_aXf zt3}xbNc1BbsVfL@T|5_u0HCsNy#X$ZF@<Xk-O{joP3SDJ?Z*FX+S2Rfo6COvL^J@* zc$jg8SsvTNOuZWx|6*Ywi;2XZ9v!dK&YeNAq#1yzSQc&oyaC~5W6$&TSz<78_}@Rw z0=+3x#(6%RpyR#?4qC1MnLeGRlOn|dE1az|?w_%}*?OUm<pyzM*fgTcLEn|DuB(eb z(1`j770$u!bFnzj95bSF9O@2v<VXJ7ODG-kLrh|6nLE6&9t%j#G}fNCt-zPvQm=3> zz{Fu5J0?DWF6+qtn6m?zm`0-PZVdVOn<%%i3{S$fXk@4gzkdXfp$SVO18q`1^sCk7 z=Ty;Up2fe7mgx1Ww)Ox_pmF$vP6UjMjCM|1#H(pB3R(+b)weVBW{LJ9cIy(y%RY6< zhm<`sCS?dEd|Rllq0#xwyx*^(oZ;bLo^VhhPB5D;iX(1MXNO>r6-a$=tw*9y5S(DT zmms}3oZyM6X6HQQ&h7c91Hh5ez*`EmQ*%EHHZZEaHrd{~!GxqdUkIK21Zr3w?;X}# z2Gn&u3W%HRZ&?pttf(o}<@f(3ihZ3cO`swz%Ij8Sp7^Ll_UXXzuwt&fibnsVxUSQy zEd4J>2i{5rD}FDh&8JbR`<0||7!{g7IIT35^GD0a=PA5_uFIyY8ov%Vhakm8H^)H) 
zojvY9Y3-3D43PS+0I6@+bl50f=tDUz{!@%6FI2Z;l?;)hTwPseBMkx-LQ!7Ahy7Kr z;OVJaw&v!q4B2ZFht6HYp;_>~J|-o#km|&12ha0|@d|tASj-ey?-}rB8v$YvS|38{ zSxZTN!T^%+m2@G@f=N&`^*b{tiO#Io&)46ioC<ub0JtjvDcWJ)hgtWZYy;laSov4N z9>eWNd>4J0ZaLMF4ULnTtUn!AmJOB+)~do&&_}{>e{eN?Ch^WlaIb*wAsn&k2$Y^l zsW!<Nm`_7}_kbD}sTHOi?6}p)09qBP75;ljHg5uZLKoGcMs)b$J@P{b9#VIcesds1 zQi<4tgy&7t1gD-3qo{>sl3Y~24LhHkotxaB^LL76#KfiDNMV4g^yeop`tluLroATB zFJ-_oA7#XB|5CUd?sku^O*w;kQV%J%C=U8kKHrBs7FMGwBAP{-lfKR&$Ye;XjC0;v zVKZw{D?#M7iB#9U!&eK<1`Vj8En)7*d}DqVxk2?TcGZVkNeMSNIWK6J$T7R2&fjtc zUWG}B+QgrP<_d5eew3L_$vP}Z;@DWZEB_Y^?(3=$C%7q!m@@p#RVmU>iUls49u@t$ zqomFbaZkQ+Etl9FgT~1<!96h<4c$So6)X5P{Rd4=)PH)}aiD73{JG4f3uu#5hJ97p zT7hBI_7Tt7J*j6FECk#SWt(bdfN-FUqoD@0I1O;tsGg2}W@K$`JvN@#yRIsr&{0W$ zU4h5yuipUF5-)lYl~fs+yHKDg0Mz9{;Q012yr5#*3P_D2*a>D=Sc0B0q;LE_`s-x> zndZQ=$j8`55{y1`#dARsRSqz3C$v$xW>~VnJJ74gBSHm&F2TJ9SJr^iFj67JHc*GO z<k4sM(nKvt|1xZ5w^*ZK*ID!W)ni|Y9L$KHo(uRNbe3$E+rZ;+2<bTR(@i<Gvye3X zsoHJ=@MrbeW}gH<>H!KJBe9eAl+)b#x!Z9CRCcl}qhZI?4QNC7y^i^=MtdjadA}f! 
zC6CmU5HS^jeT`-f8Xy@Pi>MLcmMN!2bdk?y$KJLP>ibH>lu`3V0Z7jD3Hp58(+lzf z0eJ-OwZ$odBYrFVv+$7_z$h*35I^HCNdM6b=OCR9`a`=LP`;f^($13}mAz*1T2bCU zjIZ%+jB6`=-mqgEUK)4>axDL`Bj7tAZUqw8puMWOr|=P=1R(*2<iPNsq#o!%Jg+Zk znQ!Hhdcr{Hh=vp^9lXn%*~Ji4xGIUS;7E}t^h3(4v-O;uILhuE2=(@JVP-{+;O2?F zOlzW8T=8YZ%!6i(OSqcj-$3+sbn-hN0eK|Fv4gfot1BG;Euq|-s{whwQM4CWLbX$E zudn1UOSYQ`&4+*}rY=Kli?qrQ>)>v$7q>YVNfXKj{9Ot!c*FU>J&Ova-}x^eJO4kJ zQGbF?0<cg2&-|y70Hig4LZUE1Rrom--XoCqvY0ZP*03g!hKe+8QZXO6ZD5%{DsN&E zqZoCyUfBVvm0DqkbF*=C<CRl&Ba$RS^KkWVaq;}c5dxHS&}=ER$AcCdEQnAb3ghW0 zQl`Z(92my8*ZjB+l=m}bg`p23_K5BP(IpV8RC#%C0}OAIzHqCr1Jc5osw>`^vbUXP z=ZKMzw6oP{$U0T$)VWpt%xYtFaZS}1$MME%)qUfQtxAhvQ!Mi6?$vy8#%Z@NCk30K zpktId`;ZvT11-q338GK}Q>%EvekJo*&6%J$&PHCp)KP#JynpZ3B}@6xG2Ru&jc@ZH z3uL$PP3J!eAv=%^mwE>TLI27GkW~l>AX$mf@pcC~Kx-gd`2(^vWZr9l(2>YqwLIuw zKlAr<){b<IQiB%N&=<dcnPy_<8y~^@(P=|DZ(`qQ_=&g)KaKUyM^E~<-+@n&HIfr~ ze%~DPorH|p`)_#?-^^2>Pb*jz{SgVo=Ffnlm!aBwGA9OA^>7_2a(kP!n(`BZY-?oN zCsWh3og4OnwyLg9Js_!gi@x<~!MfX=jfNP@(wnG;!acE!Sa2@kzH+Gxwyu>??)OBn z3BBh-ZE5OW<(hn*+AjJW-s2S?9}o8q&+w<^ZF_qKreoZ*71*3UY$c?&{i}i5A7SX2 zSX^Fod8#F<h4E`VO*YgeOFPiUbNJ}C5P$+ofc9`JhHbN2)bp`J!MOPwqO}F7!U(zp z@_k2Mpd0rXFsUpdG-EisFPwSoKoEeyX3gqTpC;YqLfX@~-E-uPl%5<+tas$aOUy`w z2_|Bh{_VpWlNaW3cw4ygqXB|N8y|~j9`4?kC^Dx)5O6@Jl!^#oqFwTA9x55%G+XEX z&=MzLQvG`Z&AlbvRcp%z$Y==yfWqHNd$*Hq^KJ>m8CbqQp2}QEZ8-erS)4>J-8h%% zOKl8e;tNhn#IAG26tNakWa||}G!4|duiyeeXBx*QZD6%FvX961w87{&zRsu?BF34e zpSWaKboDxUu_Jzew77;Qt2|9B^PPY$_|quGQn77l#;_!IP?xX9w#vpU{e=?xDtg^* zdP2wiftCTQ6(Mn+HBEqpf&;-?EI2_jX=~-~w0mphFd&!!koi~j0wB%(Ipgp>A}D|` zWpB~b0Biu32Oyb5qz6yGx$a$W^Ev!i`GSi_hQdH1=y~>gv0cRm4Qmi|Q`{1v&3o~b ziw|BfAd?#%Pw_xFQh5tkclzyRro3-_LsHyRW|;8@_OadveOI2W-@o7nk_lGY|LNGq zWDSl&s>p(rLvJyZ;GF*xo_f3MQ9~ha*a4`m0e2VjJ_KqB>-xJB@!TlRfX})O{*AW= zg?+((M98hj5S;U`8Es9p1XMh-mRkVUsw}?e$q%@0d3m|XER`SudvNv{mD#~yep~Wq z&di_cAJkibA*Oa~MNT2gGJ~uXm%pX#F4Nc+|9GALcQF|J6AfDX<`>L^IGZnLBo@;u z_Fa-6f&+6Ht%w$^`br3cRAn!LR(yH8T;WG4uz|r|t=ygR-9Yh0;OTk|WMDwxg8q*H 
zY5QL*e{Q!l$kzg-e*(5O0x66^@+=_KYcRfW0{`77)jR-=C;<3HDkYfdfJp!Hm@YEw z^~*U&GyF?YNsppOx=FICeY15UH4w&gEl1d#x!`=L{wU;2tN?)u<1d-g-&iR{Z8)0~ za2f6#x<h`&YX{yZX)JtT)f{9n(^Mndaap`RU^C6t57DBO(}jxg9`{Y>4lq92<>mDF zlS=eR)$)5s4Y_DoQ~v8<iiK0jv`0$w>Wuhogz*~}-Z`V7>}k3LJ+Vg+XR!UnF~j4i zGOB#~1O4W;c9!l<BU<$mZpVx19U~BJ(rRXmSi{#DG{r5SAYoIkbg3UG<{_M7Q^ql$ za)D|*ZLHvBRYh9*+x|_{osMjJoJpHSst+=tqptkYNgW+NP_bTcNZh6)Z_Yzri$*1a z0Gd^i*8TqJZC`Q%b`iFs2AryASi2SGNkMf<?hPiy1u-wEzkxHFK;hBG9if!89d3n^ z39PgoxAe|b8~CkR`&KFqlk1_=2)=p6S_n<}P*}@BAPMH7HqkiMl#9;;Xcf2eJP>hp zTKRew><-4nM1@7dq6Op13|;_!5C(JFD^YFyxSxE<u$9?N{&+z2RK0}SX2yzha6L<9 zD&b>rP$b(DftwB6E8}+=Iidc{@WBCwW_uTVqzFs2d8uixd~s9|4DF)T-QwK^zvgG= zO3z^pisQ7};CtT4nKJ7*qdy<+uq>?U=Td%}scm3}89eLf92SFjDPP7}$SH#}#^V;K zWgfLVl2rv!Uv*25H|yCd+xeovhX@X9lJHW3jF8iNb(o1o6;KI2A3nm;mEF=dZ7T$6 z73gC;<1!^+feO^kF2#hrBf0w=j2epbUvm!4>&(42br=j9uZI+J4?Qh#Jj}(t@Z=%$ z>7#FIK%UmgqJkw<-;Y_k<SzVTwOGEF5t^z{vh&23oE0ux>K!TVOYkzqS%6L^D1e`y z4gln$36cQCT=xljGfB;Cor;3EbZ8N>o-AkdIp09}JLSNOCl4@u`cE^*wnE3UC6v?g zhTCf71Pptc7UL1pFAXuJS!WUml<7>a$j~0<Og@#TbH;3!(kzQN%U!g~j=oCOt~F{% z>S~Tzg*zFzmLlsK!g-DCIr%<ptF<ePsJnh7k>r2d&d0cN-lv|GTlH|V{TgZXkdoE^ zr66(`xf212?tl!`H1Aws3AKa}0c?k@Yo=WCF4~t|iNQlmB*8UqvB`$t`myiZ*~ue* zJ0LfD8dd*30}i7nf^P3XsPHr$(YliQ7^|INbD<QN3#XvU1U^y^z%e7E@vPCtA?k8y z<clL=apjuI%9rEqXNH)S5X8NKR+nSDB->Z}8jg^qejRz%4phF2S3S2Sd&NCxz-bzl zM^VDT&eJEbv9b!1k=S+Hztu2N?yA>VZ!Es#HG)r>IMecx2|0?dha$Xd{i~;Y1B~kw z-f<?-<%~b|Y+Zs^5O%rYH*6J`fKD&>4vq>0k9&Qt_==|k|5fU=flw~%o`-dP-d&LI z<qdZLXaotGtcy@?D|d98k4d}j08w8*7hg4V`swBho4MEL8c<D)ocUNl=C=dP_RoKK z1>vmOXY;>hBoL`j8$c?z+YYIIy)Ai^Ez;&c9rAM_jk@vN%uBFoyK)ZMP5M%!L_v8K zlt9>Ntc!1?^(!bO*|lUvEmS4J-=N-V`tW3?&7C1tEZy07(St;#e&&Vy#$$g+NdSvE z(2_TBoD2m)P@p|E=qffH#?^MJJd@<rR=$nr`Vm%MGM|R`BV4S89wysuuGwY6!)vsW zj>_+7S{E~D%W<tNFsc$pBCJS=RC_8hw&*Cv%Lgr>G1>t14?ur5rwHSNk0k3Z-s>*u zu1nA?7kZ|THN3RD-rEU%^2kV&Bu}Q~i?!%#P&?GnJbPoM@72~DSx~YHCysMWB%$AA zd0DL_{YW!kNm`?vXY)P}!foJ~;tuPccAC#~|C|^G$)TX`YS20d1!YyaD^j6ci0#@v 
zD$&lMGbvOJheeRL(Z1vTlu32(zwS&*q{e8QB*s^m!JCnqIO2p1@XY6q_N!;D%&j+4 zTGGgeK+%&0DTejK2S(5Dm%>9xI>y+)NMeiAYDCfHEPp^cgHKhjY^Bzl$`ezEj`;0) z6GM;@D<-KtPMVgo!3W254&+~<%Q_5j438d%N>`v@r>`?IGe3KH&mgEbE2bpq3$k_t zrLPqJGA(Dtu}nS0oi~x3C;2WItki}KvVa(@CsIN8AE2Tz>y|5?{8tzD_Wxc*eEHLG zO3SMq<W4}jC4A&Rd<w`y0pfGZ#LKG%>3D#u{PS|GP_6?>H!?|CUsKc3pn*aVN074f zybOu#-ZpBR!|voGc27VlBB*z*n5v$`1MJkq!~`gjT562u{_|^I^cliN%qi7<^$93$ z1S=f9=pk6)zFf3TsC2VHdvB9bAcc*60I>Fp?rCI5fG#`-%A!+wo%);^e#^!^*-Y)H zb^>MsMeT=qwd(&{k#6y~|7ro0Isf0K!u|i}?}z}LbOuMU$NhmSdM@Qe2)?`S#TOFw z{4)y5X+hmFL%G>cP+Wg`(dM(lJp?M<M>FfuKnVHpo@6SF+Y+662y&RAF=E(`#+0#6 zoVp%(zOmqQjF}B4k=qTOQb8+>fe1aYwU6!NL%)bv>?vH0kB-K~Xb%$>@PzbXjK-&< zl4M1;Au&DU$Z_&$+jZhLLy@mJQ8=anXK987`8{Y&@jsmj{;MxQ#`T|q1)m$QgCknO z1L(rvvW2P{Qk1uq*SibXySmq_x+_8zO<VLKVZwu#%R<*VsnaJ@-n&!YS5rQ7HXcf} zZD*a#E5o2cTw8CdBVC`+)wmFHAm5M&0U<x1jL}?>fjjuTrbJSY|NnRY-_C)OMC<;o z!b15M1}M<Y84b!VEc8-<;Le~S2hwvZ3izZ8%{DVM%o*RrX;8d%gTCZ+A2VXsF1#z# zZpTZ<L%)d<3o<B3k#=+Tk0QEYF+xbec#(`qLEW^oW-*9HKLkvhy#Zxi`xkY@YIm}& zVp{vi88oREPK)e2SAwdO+~J@wbtbGTKPqHbeFk4B*wu7^Trj|#xruO};k?q~QeZ$N z0q@M*;xe7NjTI)#S7YJih>ntZ|3`oFpvlYiLKZX-yLkrc6X5;aF;J@<%p2QvHZB0p z8x)XxL<`F0I3@3{#YUaD!{4OHhcn^GdZ`8D=ezpU%rRO1U?Dm9Ycd=0{P?+#qUr)( z=Vre%v8>pMT=g`LS01S>``KN%VEEch?~W%6!|!p`mjb%>hR#zXT>iyAnPuzM7@18q zisJq|)%Il7HK*w^HPcNs5nfx}Hk#H9w7a$He#3A4Lul8`U8Ay}Z}sXLIn-&YJ`G?N zRWPHUHiur358pJiW5iBrH*>)gR(AfVEqUmMUk&s;s=g~Ek2`T>%<1jYc^X+bCW#LO zW3m00m!QkSPhjUkuLU`Pi2ah=S>W!-dQL@t;pgz&hu&5OuZIB{v>OmR2de#e-?vU% z<@-NKUuXS;YX~UT^mS-PI}6B#G~;uJV~Z=d@-Ud3FMs)){?n#@nHZFg*<}&m|Ni<} zjxz~uFvUR=$PWVlc>!`rYy~O)ikS9?Ne3<*sAL0AFW;!?4JW9-oWm85;(VoqFs-)8 zn6)bSf|AK_Ns^ZiD|wfg=`5m#unOtTMB!DTZmWS?aGEy%7K>3fk3Q3O$!n|j*RK9h zW+>H)g)WQQ*wtHzI{xvHELX#$2;P_G?h~veIT^uzGZ6Z3=9L9>P5d1-y7Xh->uZeW zln4&<enQY)ra>diyWlQ=L&J-we+J(zstEtkst`i#uGCo|wwM=8BDGN!24}2dH$As9 z@s?Rn8~ymR!nrO3a<tQTRcLzT_u+F!zR99%|CIL9gA}iCW|*ep5lqM~BlL1Fh$30v zSb=QgrP6UFu78Zcy*O^EB|k^i@@)&<835trnvDzbdEi6Rgw|Zk*>prjnk$wc<GY)Z 
zmoJ&S2aR3)!HuF6F&hJChS$Iw)s?Si85M5b+j8U;UBIkqTT5a5&WqIjsufj#G9G?i zuF#hwZ*r%jtgH-K!Acfe+ndf^!-B<;<zSG2_mRZZ$;k=u$DSVNM5+EIvsnY{-o+bE zJBjv>L{qoee}6~Arodo~ak;;X3a6L&24n)Cir=9o1bDwH@P1#qM9i5#z{1L^WsDXC z3U=azx+N9c?NW6rOiAl|9Y1O0vAl7Q0q2FCBoY~5Ft)%s_3|>`c(SywIKs4JT)uQn z*cT&Jv`T7?ZSKbUDn{%T=I<(It4bcC#O+_OiL$<$=}=;OQTQDf!41b&?pg}480gse zl~tob7ek!8=toG?;iP@!0COiQT{;Jej<?2VtqTpM6BhD&eD!p}AR=S%?}zn!MHpxx z=DVz#5x90x-QU5F@@j?OP{IVNR*f&_-&+okoa)gm+)=_(#wkjuq72&*BO1-C{(zJJ zZeholT&jQ5=w*Svoy^t18u<cr$&XO!F_>Q5=i<saqe{%?zT_K@yg*d;yKv~#YxoFM zgg?IzY{bkiXUZV?WBb`0#13c-P{~z$&}dizQt4q~pyz|VQ|?*8E(i@tRZrP&OHa%s zy4K;~2RB=8Iz{$PcCFoOIXpC<x6uj46m+Wzqi|$f1=K3dvfqP@cNvBB02mZFvM2T6 zhv@eeY=1;atQR#HnoMEv{#Wph`V>+injkV}lW+)im!wbu8_r<x?6#=eqh<c>hq0O8 zq~c6#+zJE%pm5tOiKsjb+oQ3ZA%$w6qKsPjc@RgBF*`_M_2Nh&O!L_y_6d#!8dN`1 z7R!1me=I}Ah7hKeUSdrQj=v(PAUo&D3Z63bAGculBUfs}kCoJ`pD}L4lEWYy&<kny zE;6&`c=_VfJ8<a8>pLOCe^q-ByB0VN&CMh?dZXLNpb{XI4AbZi1SI6jwWib-SnSsw z+1x`G{AIi~NpHyanr!k4=*f;ftje0R2R>8Mi;)J(Nj(xp^qP8m>llF}Wj<Vc+do)7 zZjZ)H?l^Tha1*_H7`2Z}rH;>dpKL(sAxGlJ7_c;o+|p_7>B{cQ!AMDO#^ZWdURw;s zMWOX+y|&g^_MwY<GB*;pFE*Up`3Pb_P~hFiJd;TvA(4r;Vkag16wTcBy*-mJ=tII1 z>VvPQ_uW;yw|+EOCqiBXqClH}D3wZRIBAyQ6X0#wR}h}HK@t1eQ?hXHOA2@wGV-CF zLHIdEPS!uTr&yG6dY|D<^Ths*VqzqWZQ}~9k+@n?mft|LjXtP+1&dlzX)V;J@<VyU zR#Xs*(Fk=dpR<9LGpJXDfcr{)cMa<WAu70Ukn5v}jI=~W`4I~3b7RI&GBNv)C$10y z$R{-xlY*GM8Jcpj&)gniur3ogFjBgvG1S;PouM?gRw)0%R95uvBnJ;aEU!2#gp%Sj z?h=HndmEEowq$bC8Ag#l&IvOADe9q%uI2>W$uGUhc`9FRnZ%zo0p2J_u|}v;ZbZqG z#Six|FInJ)QZHzlRQOdnke=$Dt?^-zY>CatHN8(v3&WICr%dU%^hWA{+n=-2CoANS z>bfgFe|~fDWOJ-8R8jI<=9E~d(_p5;PRUh~6+@k7R@h^OZR~|pc6F9}N~BA92rJH# zlo1{<iS+Ue`+Z!?JB9UQ6pY75vnzu8!M*7YRF06`MD9m!=cucnu%i1Z)&KgD_sQ|G zvoEMnbT#EiS&9YK&S4i6rW~wr2|8q~yS461D&vSpz5m!aVbQt${v3WkHnO>^=<nG> zu&1kS;H+YV{ep64+QngE=&aKY;WJ$ec^1`Ldd|{@scv^rSz;Ti7-uZ#3sbWiu|&J7 zSSLBcn);g0<d<-q<Xi}bH*br`48-vdFzr|f3a`3K*wDc)0JmIvKzt*TZ4V%`>dYS@ zdifieJBP$TIy<%UQu8Q9amqo($Y;-vw;K7`?Knnon|A$D%cn^r=6S8qZ5yECArva_ 
zK%+wZ=mU*$Ui&c&9a*e&P`!Nn1o~r9Mp2v=yn(mV=C5W7k9w%Q>G0_VJpLqRL|gvG z!(iypsHJjD#HcE;3Uvx2ZqoC*q`ZTg!W&MBr%rmCXx=YcFnjkl?ejB}uj0*8+fldk zorz`15pS)bU}f@`B%`2Ha=4?=M#3cT)d}HuE3E3TF(djGTXrq1FTqKCR0-TY8+eQb zhE45Ov7_>j!Jd1p%%_#|We8{$m1x!JF~;_4envDSQeP3u)|$wMY}toq*D74ld*q%t zhzEK~ZLrYnGUnLR&c6c>o$4-T>=NQknsSh|$yO*JN~p~#n5h33JiL<(vT_tft-wV8 zd}zqfJE?TuytSl4y~MaGZ@X!W7KFmJIu-aCh2_CH+VDUY#|ez%-qm0Xo*<*tFBr=s z^92&xHiZiKFP9BVEoY@9aVmS1?CITwi6=jCgIgqbFsP<@RzC!92rjlaoQ&x4&(ftS z8BKI9K=6Gbv5#)4Cx48#00Hr1VazxOQCP=9H*Imrk{SMSV?XJeEGN~$`F_?1_ALG& zH+=}l-j~X02~t%Jb$UT2t}j}&pgUQrYASN37f=+%9tq!91$y|*9|M6-Mk7Nzl-g{a z`>G6z#6;(1<>lcuZ_Cs>4WsXw)Cl))(sTWmk!P$Hl{ntkvd0T8_8w3BMo~*(x=~N* z-#j>hWvCs<R#Z&9iQQ8g6|%xx>0}=j@XRDJ+<h1mn90l^D_p6da4q$jB1HIuvpThq z8r6IeCr<YGrqPbP=QXj@pD!oMCL1e`)0H{BI6bZV37l;zGZ~nY#cAKxtwN1*zc2-7 z6EU|hU0fHM`jMDMuw{IFy4Y0yH$Ub|Tj7Ln=rx#4ienGS80Htop(2w-%J{uy#zjLb z06D~PT8Nl(S1Uo$P%@3shLtVU<61<I?dj0kvs}Q|mQPNoD>*wI14PkG!_02*;MksQ zi`!**O)68)_i1@t#@-Ey<M1k2C&l!lIio6>xP-<4%`plP;nbz5@Gfnz@mqCtIqgcs zU>DI`@Z)Jfu@#hq-Mf?I50#3x`>jc~WH~%!r^9M}8pDDk(0$!8$TkDm&%T%bR8(p7 zXr=GkWU~8BkY2kI6<?AxskMpn77Fr@9n8`l8gxtRBEc)|EU|nzmIXBP6{S(VbR{1^ zHz)ebRs8EEm5{$vp5ln?=c0R(syD{}*J}7*R>nm&KQ*7(d3VLRV@6-%a$bI|N_w?- zi`Xgmg9TG}OgvV8UgWEaaY>};SJ&9nP25{o94VAJ|CvpGe^<w4EiT|zxwd`3u6+Ex z)b^o>>%NygKl_Yl0}I}T*I%D3oR!%=P2qgbn)U^g_Fm61n?8Md`nRR#eors*)PegB zN3-@7oxB_SO5w|!M($Vb(_Z<;W?8GUFw70>yq06O^?k&Z_jSJe>Vv;e`&jB*Ryk{z z%&QC5rYCtfOWSWb`*F9&o#=0MZFSx2?z(1Xiy7^kH1&K`rgZ-FUqy?%m)yBnAN$+j zo}zQUko{!-y(fE0mQ9JZ%=p)5+<d10@XeH<DW@34)ZHy5=52cAYyGyr_;cisgs=z8 zR~lCD6Z%vd{?5`w{a1wA!YW-h1La5RkJ2{Jd^|_)zW=f3fu^0_dx8%~7!-!z%5&fK z@Pgp_P5xVSg?|IjP?5SeGdtdPrcFWjmGjqbB|lM8nRxQT8%Yzt!gX)A`FH&k_`PNJ zj5!;(GH>c=n?ASd&~$aZbFnuiUT1k&Z#i`O38NdIYKOS&$2nU6>?Ho5x^o1nssD4A zcDYmd(R;Hy`5k~8#sA(=@!#5(e)~=Ew+l0L>h@dS%e(4-_}tSTom_k8?v1Il+w(Nv zJN?nwWq5z~q8<95OD^fXZBQ)oO8MlNK5czaTiLmsvZ%h7tBX<?i#DB`S@l$H-{rF* z{5>vm;UyD0e_T{Gw!ZDV*=M!+ns|{(%@4~%lx1y9|A9lu^GnUmp3AY1d&1WyEjMLL 
zu91L_To~-r&H<jH2nI7sfmI(EG^CEQ23HvTxBr&9b<@++G8&)=@^tlcS?83{1OV!1 BZL$CW literal 0 HcmV?d00001 diff --git a/docs/assets/design/hybrid_kv_cache_manager/overview.png b/docs/assets/design/hybrid_kv_cache_manager/overview.png new file mode 100644 index 0000000000000000000000000000000000000000..ac80581f491da6235c53f14601b74288ead3b185 GIT binary patch literal 39501 zcmb?@WmMMD*X2V9NJ~qrDBazSfV6;=q%=r}(kLJx4bsvjA>E-Op@4Lkv>@F`y@&rh zYt4L`nKf(pz*2em#l0u?-e+G!)l?qiV3K1Z5C|Lvc^M4^0$C6KyorGf-yiU1Tp$oB zL<%yJnx1K!vu*}N>(_|g{$4xu=S)L+{%?`PCT3raC0l;{=+-$o`@G;|#O!P~eIPCE zySD@yLoJ(?E!+p^W*1K0E&JQ?vOaU4y=HAa($d%3$1F2Cty0iXB@bB_l-I)#hnfi* z43v>GRab<i*otEs2Bs%_rzfT6D`<@bu=Au*2R}CxOeRP3e9_&G#39!u!|=4jUXezu zDyS7!pXCibyDQ66L^btevU5mc-szqZOFZr!nG7y0y2lQAr>~~Y>h%T@LR89GQ7>2p zf-7{$&~~{>8H6~!Ts`>dZ6Elc2YDx}3CZBM#Ap_Ke2j9~`a3f6RkMulh)Sj9(eFEq zlf<e^Lw;n-=B8hUJp{T*F~&U*82d+!RLSPU1K2Cy@UqlmULHTH#p&A-=^682=}x?T z^UQdmO6gj34ht_!HJ<*9Tsg?pe%h*B>Jqj>tdZAhCfO>%Kz09}Z}X30BCNCE`)mqX zxZ!wG8r50v%xDR6*Axz^u=a0Bt>a4`=E!985bbgJgzUZgiolj><_cAu51=)oy?IM< zS2|gIflTkyC}RoydX<or#F!|&|J*$u{aW;J=3WLuF$5!!79()z-p%(+*LU!p*8ji% z_CJ1x3{P_Nng8PtkMM&{+S;C<|K8cTdcZ;wu=E|hYajh9ZFeF%+_T8TQzWh9krg=y zB`Jm@Nw{$Z2KCNtK|+2&$9+^O%JzAxdl-T2v@)iF{q%p@rdcuDIg&L$4HjPW)eZ9O zdJ05Ejk6KaU$tVOo>6AOa~@#?o1kLNS6g&cm3Q6ey!YGe?=u!GAJ6N_;eqtjF_%}U z!6uZrhK71VE<+w2LAO}=ek~Ca$+o<E<#f_HYq@a8VB5&2kQ009V9)%Zz);9VlkO@t zs!d+TK+w;9t@Fqm>Fd%$vcu4tQo4bb?yuF%9h_yXw{v3{8d1gqDlvzp%$D5qxhc0z zPCoQ`)t1%PT5D<;c<y)i@7oYd{+8l&Km77**Jvj-i5wGEXKwmp!Dl*xdS<1`MAY3u zw`0|qEO<dc1z%?CAB_sWKqLy{Yv6{BuKOwlAH|$FI=xM#F`0^nqt(vXWfPxfkESWg zruCmURb6;=t?yS3Y^NOd^w!LF%B~-emt67_e7~mr%JH7o*S5K%+@r5*=Q+Bh7%v1s z?i?o%Pi=*Zh}Zg6aKeGdh4a+syrx74gsi&t=QvOAXswvKvh)l-OY0oD>`IOGcfM4D z26LD1l87me>v{HQ-Dn9G1Oj5{7>O1IB8Pc)Z=nwGM4Cz6QBn6dOUjD56Lm`{J+h}G zx#R&pfdH;hx(e<0+kvz?I=Z2BJNDW&N_t5ds=uStPLUD&E-sFgR^+GNfnDv@daOqN zRg<U)eAGuocjy$H>jmOQWlWLV{pl34-U$ZaQ{WTH!c7V8z#5^l;V%lM<F-rC(jgEW z@B6;Wng+f&ej^jz7&vEtvlj?Dh02B*Bk7lYDTiB?ZcTFUDGUweriCZ;d2J}~9(PM1 
z`ZGl1w6$H=Co^)5j}~8{N~Se?7gR)C*8dnLDVvch|2@@Fd?|G1f#*Em__9>Hn2+m` zlr;5`-P-8qr{M<$zZ?Z6$|v@1Ch?wHxLL2%di_PiMJEjn4LPAc{JS|`>NaeCu*$H^ zB}TJ<P+6nLn%einC-q^R<WP@J#RLbxFd;(b{-V}mvHF|W@aufGL!Q=JvJ;<AwAqtg zT(8;F^&88cb7gdN2Hn4y2_0^kK0mZst8qBTC80pi<SW&i746RPiarkz6<-oi#-Esa z7%p7rN-@Fu`(H;-{-f__aUqf0^DRE@!P}C`Cp%LN_?bfYH6BC7ihljrkX5+RuM5xR zLvNaRi=tEv-9vmSmo{znJ6Qr>YQpJ#F7z52^jhEdH2Q4K{G$qad;f!`q76^H1O`8w zQ99pMs<!=<$M)RLZtIp_>9MlSuftGjVYk1-4UezS>&k3uXo<pbHz(7s`8;-aD+(gF z<1~K`8MY-0ySXNQyv+Y)=;}FNYeX}{x!m3}!61xonL)y8MBh=+{5nUy2n~lO^`Fq4 zu6x`MykxF%{TB%Rk~|JJCP&7L$ZP&}x7xRD&K>r>b0SrtVP#@yc3r(lE6Qm++Mebr zyv%CR59s78F~Ik>I+;+I$D`!482*$2A%=I^yfl^fv}erT+10wz^Vf?AlKnZ0QUiep zg7@ayi_eb23*3Zw(uj*5u=t?+({?)~<;%v>A!RCRD8%|N2`Iz}QK8muQ97T<X(;{} zE{}foWX#181FyxwY;B^_EmKL+jLfmVUJk{p?lT4dtJSOLslN>IaPWMLO_}DZZA^N` zw#FTdinRlj<c1MkMc?~onNJt<hD_Iwo>*!!tsFgBOF>8FJ#o0}?PzrVOH{AgE^kC~ zV(EOY52cs2#IRG=me+=!|AbBSU#ZUS8@*ffxH}Fb6UP^Q6bUO+gdxXCx;pVhD=1$& zpFb9MbMU$xT2CwCKX-q5>a>v3XG^jz?)&ga%hXz7Z>xCin7V<dP&IYpSqDE`x^~gX z$xFi&F^^YH>I<py&&=rimJcLyw9jkoTrL(T`K@L5D%9x|o)YElB;Gn>%AC3vIKoa> zWLEL+?!gCinwb{6E7J-L0ZOS-<P4GkJ6y}2sSX3&I-i1V8%!qVpE9N>F=RS+y<egW zKgF-C5Xl$QZ3&+YA<A{^0wfBx2S3LxqT^CL8O|K^&$<(g^vAUkUhyu5gobXG1|w0} zM%aGr^4t2j5WjV9<3H=Rb?F8aq{NJbi>9z#H8#VHtGW~Gzm0@Ec9w>6lQs1c5|R@$ z5~(Kg<C0K>H0X)Kby17UY3qtYJkC|@!!bW8#D0p^{JByY`P{1*Awc<2jgF0--MnIQ zyU!azyE*Q~qwLbqJQSPlLXVr@syKOZFu3wJF`CD!I>lnn7h!B|K5A(!9xKy;U-ST4 zpS)<GiNuC$?9VVhswOHSAN%WHZZ3XF_Y$92r@VY&zqmfSc=1DeOX#G!oPd!CAAf?4 zO=xj3Ih84Ib7U~?IdUr9_s6_eFI>(y&axFJQ>({yldYW_W*RHn^SB=$954|q61u;f z%2JCR<+>Y5s7H<+@m?X8rMu7C%D!N<ko5E1)OG5;#9szU;y&7PdWy2Iio#Qy9&^vz zBv*5Y*J-8xvRKWh#63ByqGv0kZVR6AjeGhbA?8D3TtY@_jwVA1xh^5gZ$|;TqNhdi zrWFej%$83cjyen(<`q1mASWk8_3G+eImgSGX)(`>sYo_Qc@o%Rv$#;>{&!n0qSR9P zqubRoaZI;FskS<iRZVVl%E$>;ptr1=8Drqc10EcK$OBJr-Gmo^2B^Q1?kLlUYceHl zin&tKXS$rUo;W<G=6F#phoks#wDI}CVMC9F3i%_VF!@+YUK723-<C<cUs3Z>jUMM= zp`mv}`a(VT9rsdda?p-!mbT<jMr)<{k<s<N9*;kwW+EmdzgufJYpD0|uCTS@i-~qI 
zD*bXDR7CHEb%OyLHy2+`wap8QuePg8Atp(sKGJ~*cB5<hsBJdgs`bv5Vad6&;Lc98 zV7Zuuwv?n~Bpe~Cci$<wU7iFrzdjCJ8ScVsbKH0rnw+ds(9`#8V*enfVYt<6|L-LP z(b`%oFV||RPMhsuRGd#*2?7P{N%7vi-!&Io3hlrNK{h&Bgm#hyzEAy~kLr1krg-a~ z7F`$MzBTc5w@4{!2nxu`lJK`QUw*>GY>--y;;)zIn;N&ixvCOFgos!oi;ANE{*Z)t zmDlN{*2N~XFn^Mgg<F*QK>1UFhf6G-n^5|j95>mZ#aEj36`{#>pFgWSwyvR*#OJ{w zLnD$M<mp?gQlM(~q-HFQ<vH4|a4TnGj3rrJ6Kdo!srYTL!WM4KTm4dg&{fyTT1(6P zscPMjse-p%d`(1pka_#hFs>LKgNHE_f&vJ%$-8&&$4Ht9_dL5@<ls`-oQlqnfJySz z+15r^_ja&cxzA-8ecPPyt22^FX<uL6#qc$$nC=2u=9qW#s(qbK%~>_P3RhpB5k@@` z3GKO4;dVJCFgE!2tvBxte!e`da>YQ1P-l$>sysG@I*J+@prL?_ki<8jE{xag>mq6A zpvI%u$y9$Mcd#vFK>gJMN+7%<jvmer3qldGXu8699_6P3k^>`6;rOVM>O8XQsFJ&< z{!2S-K})EKSc(r7hQBP`;qlj#MMi`?`7KZwPZuKX_$qT0Sphi`NfzlVazZRBf;xlN zEps?VMC_|+fZi=stWSBL64^~l?Cl;BKQwDy7rZ(bbRoH>E4@ES8Q60x47mo=RVcRS zAwr_`(Y2!pqhd&R!5i|2__=fHMe+#a{Q$YkY5TDI31QcQarxGdo{r^f$JmTIcyMC7 z3RUVIPu~f<*t*xtppA_ZcAt1(Z%r*%<7|6jLyyyncju*TO|JI2_{xUs&My-J`Y%nJ z7zi>Id8VEZy~zfeN$5P)FXvzNelNll>@!d+K4aC4SgotX&t<#8zuoi&>Tw@%$kSWS za;poYd)9=7ga?VNc>D}*0a=j%Y7pHh5gVVJoScwwSNV;c^i!fB=X)BTZ(zJU)tS#Z zQ{V`V)!hGz`~Po&{y!;{OzS__1bS9?&rtufx4+9BOCzP`H?OgA=oBg@|Ab?moenf% zN*=Zk$uJaYYcNn<5fb4LBo7VgWbQ~rv*QEp$fi*llfw+E)W|oz^T+H|QU}$LQR@e> z;tlEQ3iJ*bSOaz=&GNgRkI(zV>4zRkpLaBG+dH%4oYN^nabykt^P&GRv-P>5vgT%{ zW4D-SRbwl^5OF90(W>7IND{T=8*fdl3sWxlydAYGj-ZvFWoC(^ssFfaI-hCIO~Hqx zN2NUerk^e@`L8q?8L+>k$hUuTXGVGhcV0K?r-mO=k?x!Shh~*DE572RQ_iMMx9_9S zCBuAWhfr$!@vdhD1<fWwBZ`wlHBc(Ak3A;8jg@zKNm@uSl_E=$r@P>WS&d-*&|P64 zQv8``gTs9;<)>VL8h*gU_CDYGWYu0FUCzO2072%cQ(uHMK0P_p{#D5#hSmuE2EY3b zZpe%F4|MJ3g_A(rJW8!aD6cbPt|}N;Vfftg_qNyhhl`MmzE-CpKp1@)Uw5@bMj7zq zz|i0y=+;1$oO?X%pAr%M8R!2a%lm)lhyVNEe3QTKx2KA+-*FSV_y|O)c9Y-rRbe3m z;`g4X2sIueJ158Q>dX=V`pzHx<v8j8{upkEG`>u+cIoP`^wgM`n1lpghq-2de?%ay z0A*xrj&!stu5=%eS`PWx!-E5_y+zr+J9v0er>CBQw1;HGfg?DY!ZP^FRDrZ4?1l#C z-9vd#)Y)WO2@2gI#T?{g?M6RoDJfwiJiWZSzJD**0pJ)!>xMLndk6IlM-pHB@{rKQ zYvEfE1L=Juy#HLvN6e5~uas|H>r*QV<lr0Me?GFy&S&AI?9MJOZbDSSCjW3B_b1T= 
z%E`%5Q&S%wAOFKeM@W>7tR`wbW9H%Ur#7Mv?AYM5LRN@n=l-dXu3}^Y(e<w1+KP=} zZ}J~zPVMIA=J)S1p)$ybKZJ+u95h0NgmD&qDZDRdbmkYD@kVjmFgrRrQu*w|-oGab zm9em}K$XlKPrsMg^F&ec{rmU%su^=FULHO{>N;$G_~~&*Wq=UuNCJ@ZyAsjcIdH=f z*gSd~Q9Fd#{%8XmuH+H5S&VHiSgCTW_kC`;Twh&2Z*tet)YQ?_!wi!03EDF+EGuJU zWUMGJPvW)JK0n?%qWa~aCUmhc>bYy&`Tnlx>%-Hl^PTkc^c&UWL4qReQG?I9yOGhn zk?&dFHVQ^~e-mlk`1h=Ukhr{n=&n!bD1yl;sy?SL2V!WS7RGXg8Cep!zkE6StE+L_ zEJ?`C#?|$(H<|nV=&z-*vDyie_bVo5W_F|2X0MZNw&#s@bIrQ25^shUNZIJ<UVM9d z`*9@Y`b0%et69-_yy8mTye~6BXe5oeVd9FIZ_EipnX$xURV>0*J?S_it;9S(5)8^E z=iOMvl^UbC=~ue$yQRZ&wN6W&35kgrqF&mx_#@^GMkt{&TraxuNl6n$y^j9~lOT91 z4|}xr^n~oEs(=4Bj>ozt$>?wMK68Hgayybrc%x*XB=A%8#KqU=Daqt}#7qP$-^67! zs-<L8$oJ9R_b-lQP2@i2{tBC`vQ)LOSUvhXuFiHr1pD=JuKDcZ?7*xmavkmxkK1PV zcl`4Xgld!f)*)55r5S9=<0$IV!otGT#ci3DYR5WG&gCNB7qjuFXBxKhgeWhuPC8Fa zoj3Aji?g#AcNf}WlZxYpS~VzfEyv5A!>W~*m4Q0NK;_#&gxL{CuWHf%v2;g5jJ~X_ zOgWXW#dFV;Dqc%X`}xCh-kLhKV)eY;H-)_MsVza8lB<c`8=3XR&pvbQga|SGwO!rW znJ+hK7xvixBa9DEYXBR=!^P$7>N?^<n~{u5B_!f?T!M@FlckW04}ZS_HR#`tIJWGt zH?D~tE&SuBH;dWX*=jwCvsSZ>ZZBTsw&qBz@Af35rrMgCX74bFiPzTFuB@!MsYT1= zlVA?0+__Ft`%?62Y(!Gh+}s>~=%$8mYG(FSLjyhyap=76Dn$AC^Nx6@X@z_x+v|OX zluiz%jDs!l$7IH2+Wbe>v2=svZf<WnOxF+VeoNyVx}9}M6<b^g^Tg$ei5~8cxR3>} zW{t)Ooex~z2rH<P_^6<1!FNe0LmPtF?Q^z||5aH$45Xc~L%vczpzd72?=`Q`yX@=T zIzj%zf!%(!b%GY`R4?OWMIs$N57p~z$f{=QABW<WnMuXolyo)+_LAf7JYMWK4yCN< z#DotE-;EwpV0YnQ-zEt!oMn_7)-;jKQYr3m3hevFft_P*Ty+~4lJ$MZx7#F!E@B~X zs&v|jzb+7%a+(;+mkWNMcdu5b(@lCNA4`|{{xnfhQIRL5kxca5D=#n4rbdrl_AixU zh|y<sJnE9?O|{1{c~27OT7Aw>ENyD0a47hiUmt}voYSBiwn6|8<tp^b8n(RVg>cW) z5cE3UEH5ulaXH?ZZ+r10Hl^t%pWXhBEwqP(hlgJu|6^oiZ1FlNOZ`ERK@@Cahg;#) z5m;`tvv&&>VJr}7QtmvM_C2veWbT^fh?QW;GKyvb=!e9+>q!o(z%Rt3m2N%S%d0P3 z>UYG>?fBHy4ga-!%*THqQegVIszmWjPveD^Onkxz6-v&`+<MQ0jSi18Ba-mj2>h^j ze-nSPCkrQZJaVKY4G+DTxe(@mNi*vwuyth1cJjGZb;3?cPfPHXYg=_Z9rX_P)#~87 zYuqE{*L#b%5Cc4-7QIOm6B9X)qqgT-npV@2l9KK`kQ>RZ-q_eUKiQ!X_q{+VsK&j^ zwLDg;PeT@^SBiW4wxHX_$Wm9-^`z0hFnNT<pMfj|1qFi87f_ICuI^<sg#EinB)fP& 
zAK&dqL`>|qGuJZp7&1CJKK}5W+d8GJq~wJ!w+&B<<QudW-^+hSiK*%7KIg~wfwZ|A z){vamDXY#6{U1e(&n}+i4G}YI>*(lkc+}ZXk7dh-Lrqdn;<(4gw(an_f5jQ<`=?JE zruoP8K}!O%Lndu=?A`t5RS1L*OG#z+#aKcW$Ym4PE{`N7B<J)a6#FcY!h^<n<QE0L zrhPF5!A+g$VT|S<SH0DFr?lVu5|2umSlN~x8;FTlqPA!06k_St6&{?gwDwih@hZGZ za6IXh<wM5^RQu_sKhC<acgtIhjLev{|KrZ=oY&Q%6PBgw>Cso#0DP1$IxK;-zE4JP zKo)A`5)Td~Zk^tvQU>Og@_>%l*Ob{}n=5-rBQ{476#Pk=d$S(%v(2xD=jO!d>DQKj z#PP1N7_}NOB~(t@A>Kts9`EjwK~qWMG!t{|A*#`PuBNto&$9LP(a-7W^J#}RDnaMv zeC5=uQJO=FAgP$d#I9amCU*AgU7FUeD4Gmj+c7_u=Q~gtUYn2|&wcwYl@;B7W^d}* z7ZT9EU^i1g*wytoPlK40w5y{dFF&7&C@jU~k78bIgSwaTzYL!bIxN~toe_*-8xSRV zc`HBS=$$Bc26G<E$e_A>yF5QR^S;Veh(r7y$dW!->&L^z{ng!_#AO~<RK(~KshIbA zr;_(E$e@Wmy!`wH*8f__peL>Ix*8S_E~54T`pibObMR5<u<(WSnBPGi_)`7Klq`6O zOcwd8>1|n4Y!slp_=`%d_Y~6AWXdO#G>USq@hQk&&NTGQ4eFL=zrBy#F2VLJhVBPx z7%K7|guTh+mw|ylAv>KdeZK^J_iDTSrz>oCauX}G2sCKlt_`XWG5vU3<kDbJ@4@_{ z^R0<^uP#CIp~cVMTVoMhjF{ii>NSc+)~luNNDhNeX5`<q;!J`?h%wg}#LLN>gA}j$ zF!AvK&u>&Vq6I(%r5fSiGamP61NSH`-Ga^sjrQvIR}?@gD(OPRL_{2=q_VQI{Tbp$ zt=@vcCh$2bAy+aEV+lRIsqNXOmX?-eW;LR`Q5##MD)+w(%EP_-mn|oP3b8f%Tu@-9 z>l|qC@uO2xULXD${PIO&wO*ybYbX6Pe17hXmw(PSN{fLAbY)njL|Yr1MnD=^SmENr z$=QV|51boBxVgERm=u{3#wbgu+B2T#wo0z={yjS1p{midx3Kt-nksBHkSQI4J2^e= zu{EW|mYA530L@sHDFF=~-RspWc!T%UqK#1SG>SCu$Gj)w{MOmoIX^GX!&6;SVh6?Y z>({RU?Wly^4Qgz(`m&2gM@OL^`ko(yI-GC)diVENXkYj4-7{b#MKro^rNqRj1C~M* zsAg;r<zYp2_5Ape3_-fL)CCBKo+SKaYkF;GhdoAKE{fXT%xpMY7Rw@CI$eeT<&1#7 z{`AY)M)Mah92*Q`<XsoP-+uP&89#sB#DvbODuuXjb7^U5NJ!UYmF3^V(5rC{T1aBI zV7X2?sow`qMY;c!r)&d_NlBJ{gfpL+%L*ojV9B0N<+(&siI`fCaxigKKQS(@Z9P8l zOHv%x3(MAKyI9!Vi+kf${g-^=&~2zEJFt1d$4g6rPOkI&QuO1NDKB)Mn&$$xf%0@D zVW@hgA37{P8wgzPwd=1r1nEudXlY9tPyb3FHeWAdye~G>?A(|fl@S-NCE2=JkW850 zt9Q7RoorX@xwmn2*?Lz~E6<S2@})ja$VdAY6&f583@2(D{W?1*VWW_%=XyaQ!3Ulq zVn+DHccy9*jMfn2)8l3Da4`6=^s79bnpPuc_4sF7%$dWV&lmJ=xC)77v_WZo_~gS( zlSlvfI4MG}+Uo0yb8<o_n^CKsiOC?WEP{;NLJ8__ox}V~W8>X+Jn`N%K|+N4=7foZ z!_MMjrAa$le#9Q+w!6Fg_V)H#e+D{2EF<IN#~a`V+mSBp!L>py;yD0e@Z*Q|_Dn-B 
z-aygQg2sjh2#MbAZmZ3S3Z?6BSC{8hl$4ZIRJPhkQ0=M!9~T!h57Sh@Ble~8L&=~1 z`qc#52qHQuiHwZQ%GMS_--xmcb~Gm^Cj^%Y6EXhT=%y=mMo`$>+taIl?4_%P-xd@Q z5D*srtY4>cKgM-y%I#pakKM4@(b3U3+uYRA*}2i{gbhI{;Ap%xRrBNni<7Hsnz*0u z^H!Z}><I5Hl?*Xa*FOVo0w(a|*(Q(A4Nqcoi;MqmP1nu0`d}eUE0SKR8UY$$XYc>{ z6M%PdbMpnf3@$EiPa?bD)kOpJMQm(W505tIm9V<Hljc_kOiWB1*e-5vvo*GfEiIQD zqs0;LA&Fkg3?D)(rQos3)F^@_Lm>G0_#j(&^O%V+y}ZtwJa(Y?qT!H>yxM=8zkZk7 z0v;^Oy;mr`%znBSuxTRO^Zk>P3?6F&<zaYB7gyI8Vs+0x8?rJn5roPB#R|cr*&Zz> zJ-GXk*Y-m}!P;<vD!o$DK&B*0joziDYR?;Vyw@k&m+Tpbev7XsQVH+rIxw$S-=C$q zN>o=YAhGhk(8Hk;G}+8ur#9>>CD%JRMaJqui&40Rt5Wu$Q>7?>J=(I!4)<hewosP2 zF02C$OWRJwdsXpGaD?S$HCI9GOFLT!W2<J*Q8fC*d>9e55`K7LV-XYolq&gjtWq)u zB=(GE=bollPe8Fm7WCm4xeH2I)HTDZa%hSQ>qrY@Rwa`aDSuSWH16`V;W55rVPC<V z%=MMOClgb`dBoP0KX0Blm4K-&g__5%Bs6rA{cAzvG5PQDPo-)3<1a;xjg4J*NAhkD zud*4ou$IbO+V05Xmjp5M`wDNLe{oMQwtC$WEUJxD+CT3t;Pur8@lL+C|LS0E+5Vx= zzRB?@y~D=uTO0Fj*ANSSe$AggeNxYnLqHLGGj&s+_+3_&Qu5yf27X}Gmm7AqvMLM+ zkOT;{HO%KQ*Sq3;vNJF4v2F19@#9#!43Ss&2|@$kyy3T>;<5ZinUde+x{i#1otuUP zfaq5qj-nRj;^b^_T##~f<Ullg?%jXzpuM}B;o-x)<Bpw~hBr&6S<)ferF!NL4xb0D zer1UFFYT#(6cM!il?EhOoro8%i-2zXGoJCOUaeh`8pF)Y3;;VPKwe@#|0X6U2YGxJ zzTL94v|L3)K%USDIDQ!#;^OB&`7@Y97Q6+e@@#)aTU%Q(51z#7cym%Yh3D1ju8E1s z0_3&;<*%CuDA8H?{X28iCYcix@Zv{tKQWid6N7q3lZkSZ>-EupWCH{;M(h<jED4IX z+aErBP^2TEr>D>Nu~_?+4&b5b^%2Ob$<P6S#|M1*{yq0eBHN=!k6K$>A@|#Wkpai6 zs;tDu!NH>uvokZx8D3{$W&Lc>0R4qfRkfdlRqutlIjdeZgNVpQqLFVhzXNq*Pl7z1 z2(@aKT5fJGkHcIVV|xD@zwH<c3rk63<5X{N2{bJpWpST>uKy0#Ys`H2mlX^Q4BXvo zi;C9QhYOw?7`)(`48kI1Gi;uKtcB%5KvPc9E(Rw58#rB9SXgXa9Ag;f?c2K5R)eq_ zi%Uz-$_#Xyl<UPF=&(?>OVP~j9a<08jQg)%`L5(8=#Q~*an~8tzC4+@_k4|2*X3%& z1dn~5ZT7;?$I<09iLvKHe*qFhSdCZDC#R<tE&|QO6z<m?EBZSm_NDgZ`FXjaI1%ra z-dd_s+N^HXy&lHolW?m!{?ke&fX0*;*tIk)Q9##bx4QKX@R54({cq#q9FMqqe~d~O z47+cR73<-sW{CV5v(2oy-ISg7-AT#L|6z-_%k^=B+1d`yCJ8<vv89!jW@&B|^+bj9 zaQ_22>`hT)+M+i@i;Az@j!vpb%JljCb}D+T8Gpnw{o9UsGOzscl4)-x=bYc7SAQ%s zD^FYhGrGAI4>@+b4$Bx+Kp?&bF3nF1RA8rjdV8VZ3wWP$AZ8%Kp*F9suJW!GYJLI@ 
zWoy384_X1ViM_o&fk+3490)durZU0O>}*pgQlQ}|s;UBj%xBmDz2kx0do>0kC}|%B zoJ@!Fl~1;3{{oWO*w7QM&G5T=HB;{dOa$HEW$o8PIyy>WcRRS=yzjYvo9`un1T7sM z`C(W1^YPa7Gi~h;A3y%$Ip(Sndh+DSYDI4sK4Qd%H*yJ{`e1MG>})M#9-bJ=ETp|^ zDj)O$TPV&V)cx)44;UC2?%xlbe0IZTcsbKVz4*Af<MP+xy6EWWJuCZgGoPQNYvMP^ zYGr%-V@isiy873WLn?CepQaUQX=&7A-d5Juf_4)e2zVob$iJ>aDfzZ&Cxob9H!+f8 zViq<x?X0XY5RehQ$vEUZRx|aSh-!y<Y6R$Xlw@QA)<blGv`MP(lpY-%9Q++AdMY9D z1!_AAGICN6y!4GqrTS6iu81c`<PwoABN66<2RSMJ2mAX&!^2cmRLL)i2?+_sGOkzq zD7d++QmQc#<PrJ6rh4h0mTCin;(q=S7~jrB#S#R0QvaDH0wqF^^>dkFgY`~#j1RG% zan(H$9;7ZO>x5j(HnzS?{r(KG1!bFkRp#1q1V&)T(vlt`7A-(cX1H2A=MDv#Py-KR ztW+^ysi(K+UD7C|-vXN2@ySWaO9&=%9n|oAYjTMq4FQ3sBPt%#jR{Git>l<_oZ=Eh zDT$E#dW*?Q^UttK_w?61Rt(xj6892Av9asz@G)*<zuD|<-Yju7BYb*d(W;0*d0>{H zn2^CfYF+dE_2Fj?L@-ha7M%h)8F|glOcl9Ao@OBtrf$Ng%E}8Em6rxi?Fe~xj*n;6 zzt30rB*t!1FldF1jZMO4aJm#F{%LeWHk^dB3yO55W&g*sgEfG^4fXX1e@!Oh@D5%) zC@k^vbY#J|Zrwui&&tYDW5{J-5*PP_0#_6M#i`k-%@;u%_v2G>ah`@XV1CeVw2ML1 zVIIcTH#9VC+G#5<hy7Qii%r4qGp(rBE7fE=+TBGURv+|Y7wjJ%q9OpT!v+C-Ca0j_ z#w)suk~h4b^yu%!#l>o0>iOv@0}*Dr$}?TvqCqPm*FOjZ8XB6nSlc5+f}*TCQRGrW zLIPykEyOBYunD+T7}Xhw#@0%^oe-JUjTQ{37uQC48byDusa)00)BI$LG&{e4AL*;? 
zZetf>V^d+59rp_e2!J5GNg{3SF%Y?uGNQ?XwY0U#$;r7KTmvMclaoVDfP$6Wl~<0< zmC8c+Ll<iCKHG<SIypHR8XEf9u*Fnp!uI4?SU^B2g=f7x<^kk59|s2l0Td<f#|L&J zG3%k+%*;$^0i5h=0Q}<O;@aEWLA1P$5Qs!aLu1pgW#Q&N7%hHw`R|B=>xDdM78*~V zN=r+-I6H$xbC-t30R%4{9nZz@p&*_?otbU(%lMI_v>rjutN7%}lA{1{6lgA3BrLGt zaDOqUMVVQ#$!oO804cU{D;}T(38QMdl8VLt9j4Z9dO_JKw8me<`tY_5h$+b2cV_j= zZeh2BOQE3or-Elpqwgf8QQKJOqenJ1ily2nWDLu>+Bk?zO@?Z6A=>iG(&9C?=gk@E z$sQ{gp<!VfVaO4B4~keaINW~4@=jYSQMS4qP{(Ro^Wx)=7(Ob+^zmFvy|YxKXIzo# zHqF+Bw~dSC6BCy#=rOg7^gQspX+?A6zp0L=x(-;ehMg@cVf<Vqgs4tdJ0D`Rv$?+( zL{bUJ4Z6D3U~>!%8k<aJ7ILT-Yw7Fo+6@f+>+0y~K+Co!wA`Mo)U(t19FhcT%h1pe zKvttBcl*7?4!|E~!JJJE4Z1oy292)PBZca{$t|ys8UcmK%gg)vUIAbg6%`E+3sX{3 z!le;2(9|TNqPpB)d6JS3XuQsDQUfFiKtxb3k^E)uo0J(_TQkP}2n!ESPfI)Xz6v8| zhDOT=d=2n1i0^>ePeH&93JMZ--*j|xQczOb94nm$VBzfSTx&m_KWOFc<1;%u3)Jrf z$|`8Ep)&XL+p~`oUqMrll9ujI7k;Lx=?WCMxmg6FIpqxh#|tit_xy=H4;6rzrX(c+ z!q3djrQ_n_Vq^?Gn|7jn#z55jJJa9azdZzRZF$*lzLg3A{9~TEV$Kvv!qjx}{ax;t zGxhgK!rQ-n0|nRz8jSUDKFEP~U;L4JUxApBC{KruOKH44*XAew=uv)sy(i!-za~)& z07w2vC?5eb1T^Kng%=A70s$q8QgEohe@FbnEjl`y1713j-6%LHC|%6wHH3Lhi2rp^ zY%D>51T8`6)syuj%BU`3SfE1noaW|cs}!J42R{>-k-m(KXwngYic1m>&5kKXzPGOr zx@_;#UJ^R_-MgF#wLl`G$az0O0tE*L0}%m5l98F2n}^4HN8#mkZ7JZJi3+piCesQX z1|sAyumj3UN^+5uub|t(^6KmBdu&dq)8Yeh;;?)<Rc#Hucy4Y^AoBd*zpMQo_KzPw z&ikA^2XG425l9;(UqnO%KlCp^n-OwmR#u*ef7Gd?#!L12AAJKP{QC9lN}ZAik0W78 zSK-!V!Ds*e)!I!~s(f6!Ms1fYiY|_Kj+yY!M<5t5W$sr@e3360EsFoWKKpHCW^sBa zAM<U!3i_jb*oCX9hXWyWKR%RBUsl^i$mJ{?oUxFE>wc0eeuEZar-xSathi8RRwyE! 
z$7QPitCD3_L`#l_Bqpj<#e^DDX-bPDNn#h!VZNv?8va+!2?_PeAusQ)GqURaV_xP7 z{`%co+#<ICdwcQ&?Yque_f&Tpn|Vg7nRs1hQYE9dG~1m&8qZl-nVI#Qwh|}f9#c=Y zrS~lL>sF_9Skz%|7>|C@V?zgO!>U(1Xv15~xBYLS$plH+&Jo+;g;|}=Fm-yZVbqm5 znjsW<fCaOyK3?D7-4Tf79&LhHSzcM;j5zL{h1P;cDS+SQWM+2LaNYk5+;H)y`AUM0 zzx{J^9)z@qMnuduxU4~p&CJf~vWx+`w@Lw;`Qno+Ky^?9VF+c|?CB&W6_}bz5xE3Y zFGv(j3=Ewri@vwF39V9~;sU7U9*(&NF-lbxP_pOd1Sf7t6=>K<C}^IZo@96L!p(6( zE!E!AO9!_2@91wb=tyNCQE4){xw)BtDhK@5>v|`U*3Qn3hrACy01CSI#i<hj9}u|S zy?Y0&wtUhK3k%BvB^sc#nBP@e=sf`evM#4UTAM^22NNFUkf0zODj^<rc6M>`HifuH z&@QKIY@b24fd+80If;UJ`0ybc8yir3azHQ$r3|r4;3mL#ot&JyBB?02EqaX-)eQ_N z@$nsDl!G_<>@`r9RDOq>(6zJMgw_aweQtEs3{TgaF3c|`CidXL10kW;5Z5&IYlDF0 zkx@`Adz1Ez6vsbvTmCWtIO_0NJ(dox^MoXPv_zMC^_^S}BO99qxd<~8ldI74J|YP@ z6O$|utNPA`IXPoGOnI$`-U=FOYg^dcKlgI{9eq#whOSVHbkArNX#z0qHAY2r_xAE| zbDJ?m!rvimt$j-GfUdf<XVdm7HZ~SS0S^y$7MAMra$1b+`g#F+`mW*O19mYtrHnQm zmX|}hKa4~`DZ2;h-VE@}NZjFiHe&ZZ6YpSIL%QN+S1GALBr|bXfvS`qnj3!TcOqFN zV;)%&QsN|iJ`T>6G+ePN`^#9bg;0@?{@D!HS_w+p4~rPOZhy;3<_~_Om&eM~&Cgv2 z61S2WUgQ#4WQ-6&7bi81s)9<KruKwzqt%EFA-79PwxS{?D+@)X4oeos6R-5XmAyXJ zUL=Y8F@LIsu%5V-e#o}|?TA4c@;&i3U6Jj^!UH0ceVeeZ_NRt<KZ}b+TT&B=NHO2u zcfoQ}mo%4q9hUy)X}jDi<}KNB$FN+5*!viiU7YEW%UBq)HOE>SB9i4Pd<P58lK1GZ z?d%x;r=iAs-=`EEn~Vd<t@!5OpFKTQ<>kh&Gkbpi1mjgr4KIQr;peAMH<HX0ecP*p z{L0Fqo}S|TuExwvT5<9D@$qMm;*k7dP_A7Z5f$Yykcs4yE*_VZRH$9t(%hUxbPFLU zC<wD-P%n9L3V<p4UCjV)7#fNd{AMsSI2f}|0<#^H3Y{JM;FQ$V)QpUsjg3#EHc$&~ z%lfOw_^bvH2q1~4-mO_#ChF>gAXxeM_#`*ng{xo&Wq<k<CWwPz2!p<83y7XjT37e3 z@^H<$+Pn+2VW{5ef-bA<SI0uTWjh0%o%sM|fXjet_F_!?&t%nFC?UNS%<eWJ?SK@J z@mS#_O7!ctVW6p+A$GL42h~EN&%+8}W<`ZlpRGdd3uoubH*em^(?Lf>l}t)bhA3oA zP_(pMOIai&Ab2Zf(igb}4Na%+<t#`+Q*{nEj4Xv4epXUaV$&L*91bFDlJ!0tD=Ucg z&_DsxK^`iN($UjD&Qg=7BdE}MZ;}KHke*HzxukS-tG&Fsy85Kmq{FlVpn)cnGLZGY zKKMXHc(@Bla2t1T-@Xk`Z)`YNS;+<9HbM^Y2gS`J)%{un2fjtFCk}2%`?3_`v>oTI z?c5=D{<F+g&pxxeP=+8Cc+Fao{JWM$;RA(5SjZB6)!(M(u<#H`G{0&dR8&`&ob6uU z70cM!*|{EH8(}7r-8&6zee;iT`S7|<b67xfLTJK<H&cDM%Xf0JW(|S$L{e5zVc7VE 
zot@1~C4tF7gblH>H6wQW7UfoTN=gWd?BMD3yc!b&DL-i%&=C$qMDTZA7JXs;nv`q? z#YeY<9}8=s>#^RxA!}r4HoT!G{wA5Du>z4^XMF9zDGNWuATl-7%aorxVQVs34HO2b zJn6~Jtq658PSqx7?OvfnsL?5_69ps5sw-JWXh>F0*tKc<^5EF;G7$TvVEv7nQYdzR z#*@*1eTm0KPhW5n$zum~tUVZ^Hq86WqDnmXVaaNhcKg`aaf>!T3}c%{yh#0a##wWq zV(j=h-!-kM&p)qv)Tghj%UPFfRDqzy$FT8uu_*A%)sagqaqcxGLOwQLF%MYA%Zc&@ z7=<SJ{eh+pijRs4E+S3P1uPR{N|?G8rX4l5<KEuh2xE|8ps4EV>Xz!);bLNf3V98b z8A*aE!Ts-8DR2%DZn1E1fF>aa+~}-VS62YPm4aFzra>D3F?DAktNmLL7E)$;`EEgm zuh8p525wX{2tT+Y8#A*lh^^DL_CDU;N!C-4mOw-RUE0qz3s$}Ti6?%+0T>5hkC&Gh zca{5%9JRIvs&x~nSKJQ57HM>d5`zY3xEj!R_~hAQM>rDHfHoLxNJ>f~At6EE=;`aD zg0TyjDWLzJAFdw&%>DWECrs!$I5})t`jWXVZ`A7;JOhix;)K}PA(%K$W#BBg5AtZS zKCor?lO3+slU)?hjDLnQ;Da7%EM(3BzBi51`^1<UMW<W$Y(}p|D1X<D$PZU5$}#~% zkhbAjZVE3}IK9^TGRmY;aL#&d&2t@B$PlK%Bp(!5B08;?M@Nhqn=K!a@?z+t_LHA0 z@JnMNpN@%>DCiX65*m^kg{d@yG?dV#$3P>4RXYFsxou3-6=XgzOTgftnVA^~RPrDW zU~AaM2i)|*SsGwDC;z^7kdvK_jErn*YKkz1SHldt&&TIJ*ZhhpA*{m`%mX*|2lnK~ z0Kqi|e`qjxDJAvoO8`m~l`t9t2$_?Miy7u8PSDgsWe5lfxj`3$a2evcggL?P*~QH; z;MOfppXYb)-UVF;oJk;@`WYI|fi?jE#}T^gyLT9X74qGn<wdXebZ~4Qt`D>7*NS+b z?gGCf#_WK|9<c%6!;1oy^zHe!o$c)_aDM<;!UoL1$M?aSs}$-eSY|nlzW|50+0~3> zJ`RvL#SKQWiw0rT!2GPt+c=SRKLEUoU28O=9(6?pbsFlQ3m(#TuLa&ha3zB(C0{VS z1;Mc~Qq(J6KDh~kBB<=S&CN4&b04jVB0@uxxy%)}x*p5QuB@#AVVU^ypRXT36qpjk z0LnwK0MZ5{tSACtoC%rxa(4^_WLH;LL0%pwYmbt+3%ocCv5;s%e{`5{tzj?IWRjMW zf&s(U=B9|rx3?f9gB=01DhSn%A3xTcc3J_%P%FgR@p(af`uqbaF*`_tHVF_40KOz& zpjhbKM-3YrHu<xkOG<iYXMG^gl!WbJxd3uQ;csYY$jZ7mtYZnuYi+GCTn!3<CKC=0 zj=A2;w)5>KxMd7osL2G(?;s*ZYVGyYRZiB8^83lER^wW8PLWN>EbL56hShgJ*SbpK zFN?Au)R+>W<b&617y^MY?B7BNO_JZ=*`^c7DH;(^wYUeicqY$^M|-mkpad#mRVE}5 z2b(|}1tG${j#42uKR;hdD$iQ)eoW!$2IT+l?k><O&SBeSXF=s*=Z5B0b?86)`}=Ux zYMmAo6+6Dv<c@DV^`O!@3s7RW3sB+?lm9xT`?0d05^zPQz-8tKy-iLO-tA~-KBFO1 zrGS~8ou7ll_%$jkE2}LZ+M=UCFF{N11;wLa00boA9oX@vKmUdQGC@5XNiq~gl@tr} zsA-310pyzoY9dfxs1mxBFTez?3xU@eNmZ;}9D;rKTiWGT-8`FKb@$JoVfpJ?#oD(~ z13^C^Atn8+Ta^#`Q1AU)(4_OMU+*vfkPgNc^t)=#$S?x@S^Vi!lH3!Rh#elf0(0Bi 
zDji;jt_nq}7K{!MrZ5h612O<e7gQS{nG7r}?*W!fR<3MLR*|w9)Ej>d{FN>YU(=J5 zeIG@=;0_>Qknz8yj9ls!`I{ns!_$DwdN@0qYG;|RwvW*{=V|<Szt0blv-9E47nP8l zwbsL{kS8~5Dg;<1S<(o038E$yf(jCgltpI%z&Ca~xb!SJBhb!X!`Ul*GUe;>tvU{D zLJX%oWUCa@ioFyuan!TL1%$DgS$j#9nDcV?e^(!2CMHY-$O15%6*cR8pAsWq1zH)@ z5nz5v0dz1%bocO>hj0UOpRbZWm9I>Jf`SrDr<x|Px4+Wk(00kk%Ukp6U{xw>w~NN_ z1=`iw*%_ERklM>p&~Y6bW}v|RO5rVrm4^Pb4HXZ>1V~mM7@@$NcX!tjI1}VCAu6ZM z2*a~y<6sd%e1V%mC5K&{E_*I)u_p@uo}xvJ$OJ{s7TN^3dLW;G`zOo32QW2W{EQu8 z1+8V+W~xyA0T2@yzQNr-io9|LMG4ANb#1K$(>s`W^4j+AOxNMHmvfqR@&Nu1{W>{$ z=5m+$=+0<w?`0G&pnsO)=bD;tQ&O%Uq@V@Ten&%%iYkg{%9GkN8f)uxGRRTinQW}d zE%eWJl|7@W?!qTT%TV1t^J_!deU?{U4dt+)CJKNoj5C|-5^qW8X*_Rv?FMz8SHH=- z{`c?SPzpKznuGc=V(7ic4mPU!9qa=y@ZNwV-mB4R#P>cq;(Kp{1#S$T!Z43ZVzSlV zoY@PmyOQ`dA){PalNhMTVcAwG7M;rSbg@&+c9&U@BDf(>y{sywvN9PqTF!Rb+BCJa zwE8xABIMdFs^-I=7HPuxxI#xM_QjSHWv_$f>O1XX^I+VfQJYe2@Df_3<THfPV#LxV z2TzK#s27hCpzZ}cB|BO{S)=mBu(<UGK(YJQ)a6M`HK2wYyeH!62>cXU?csUFk{<(P zEh}rO$Ije!#bKM4-zwD=YYG}$VkX1u{#rLUNfDz^EEkc%x7DS~wcgfD8zN1#Op3iR zxBopagiCjHDo>nR%;%|{apv%4-z`A{5iS<Wu8I$(`o*Iej#jFo$sVTxkZimVw7^hd zc*gk{+^C`=B4q{*`{y-3%M4pUJXM2Dhl&7}CrH0Flr|H{K#mvP(eMvS=|VLyJA&zg zhpTIeuFy6f0FfJ0A{d<@DgA`&^ue*)*a$r}Zhhh((Z-MFb`Mn#Wj0i>;@WIqEwC$F zEXr&>5TT9|{B~H(iGIyT3dq&enD@$&HEG`H)3K6$KKfn@;i<P6_w&^j&Ir`K>A76q zcB^F800h`!R*h~iUiysNBz7N~UTqnjnBO4w)4e5Ow~giBzd>Buoo9cyR38w46w(ed z0q$`8<-Ntl#iZMU@;D?|T~0}SFS{#@+>h^afV^p$+t$hPdmrNg;kSWzwC_Add8wl~ z`0in2&)c^%<Kt7}eqAH%e1w|#M2+aTz&$=%r3fU%DVzws-|VBmDf&95ow$607N5X? z&BSiM?CSCGmjwd3&bKj~Ul;HNRCsv4y)+bb5jssolOu~#RU2s{Od?~z^z;twc~IXx z&$iENtXB%I*r(hnS>X91At6!v+wFykiP`A74vd(9gv9-5L(2m7D&$5htTgX|jT>!r zvqelaxLEd`Mv3#7et!pg;7#H0Spn1223Ws(C$GuTLUJB9$Fw=aOa>edZ{NOk1xYJU z!`#9G*<UJ4!1YhGMo}7<IpzVgBvS$x)+t;gqJNE^j_$X>OH)&4*hbK;Kzk%Xz;xzx zcR?D;0A*KSU*C%JD;XIXxGaoM;0a-<de+0<29{`U@lGwKQRwg>jJBSwQdl$<LWN3* zkH3KFL8)HNztvO+5J{jG(GmOrM-&Xhl7hrRK7$D`%o;Z`33O@R;glcsjK3B4b>zRu z^WdHqd(+8CjG1|l{_j?T%hyNwtI~22mT*`Y&+n4yaw;fpjL-ee<M%60y91A-?6=Z? 
zDu#XYZJDnbM|i06*QAJfT_~-lKGVjrdwahkE9S>^T5noOugY^_Sy>sFF}Yy%*pjNc zZOn}iPlxw)I#*(rrHQNHmt0B9EeN#Q9+Z*xXB3ToTIXJM+nj29IzG0(9z?1_bte=j zI4YTr#Ob4~2saNG8g`S%l7?~9uU21Ho{UNYT~!c#U}gj(CTQ#6Wx}{okaFZAl6*vA zePSLwhMD;z<&+mTHj3&*Px*Hb53w;Ypr3*;KWxPV-H2W#4Fen74y0Cu0gSu>jRObR z2F$3XHOk(0K|oBLH)wTJ^IsoYLCpsU>EYplg7O9?wbl01AJ+52A|tgKh;AGj5HK)6 z24|JEgM&C|{1$Iz%*|K8oK~Z^1-&v!;S$;lj1ZxeQwuoK2AhxthjuIi_0G%9RZ8Lr zXt02L0?Jx(7OYAtzxucug_bA1tZH2|1ucz5ulk+!go(Cdaaq|m3<OO~?n&aqe7f21 zx>e>rygbO0>P!hBFn}?m5l{hK2nQP*=sxf#DjJ$L=snP$pl>V#X9q`}?r#Xv3Q)U1 zR%j2#u^lM{X#$4WU=9N`^9Sa73JSNY(5wO8sjI8Qpjzoscx0rzv-6M@5A;00gH;^x zSi7vLSXo&CjRAh5q^#W2)5A-yY-3}ym5$mfZ@?DwA=b;dKi~dbrJ-?pl{f~clw$uh znka+IS;7)UI<9gkyX1X!saIW)cq9dH;S*`w)aJC!Z-Fzse-qP%8h4bZm{S73U8KQ> zqzfYZQub%#9-ov@OgF12mb<vt+8Dc@8SJi9eZ55Q6vb_DSv~cQ>Gr4l(ZtU_f7|O0 z>4E9Y05uv~r()iPwN>|rj5k%_Kl`bu$z=X5e1eavakZ6|Y(_Mi{l?xpq(Ojpi>BV~ zXl5yTnv$)py8btu`#z|n=1l$Y=Un%qT0VC9;Ea^r&<@m5@JCdA%#)`B(3o8as^bd_ z3piN_mKgx&NxHiw_4PL+h8yk7eX}0Q1mG@^=fFS*qXSY>(vx9TaVjzIdcYOHYjmN- z!l0_ms15YI56Q_zFlF2GfrhFNqBZF8&><ROss!BwzzuW)dl;|+`UI*AS5Inb5k-J4 z6~4fD;QDtgqGv^e$PLa#{Q2{zva%Ao90qD2;9Dq405%Rhn{}%!IHfUwr9oV<#>h|A z+CNc`2P<U&3c8e(RIC5{)0*njd)HF1?-0FS&#`WF%$^>v%YSYf8XB;*ii$sBegoSK z#%2&a0dnmh9Na<%<H-*wH(=kTAR>y-$Y_J7rQm-l0!#vAe((bsusvf+fZ5%m02>h| z08}7@Fbdk-+5$sUjm;=iST;!epcn~KMuKq!5(dy0oVWPa-VSK>2I0ahEH5vE?+TU> zj)Z_O3tYW7jGc?C@|f$65eGHRO9L^>oj)_hel!ZVB0vgJi^r{X&{_C1M7N>!jh((s zwBCo6{aK^i?9i9mbTa}&yX%#aj^VDK4~YgcN3GaqTHD;Gk?2XsqrL9*X9UfA5|NTn zHCT;m2f}}vpvJV^`N%@RKOoD*+G)_^>8gPrS+Qm}_s~#cUn<E+ZBKaoc${C-=fOe> z-dYjAy~Dbtd|o_^MvCLY6Mr__4{SB%cK8Wtu8YpyOVPr2C$%VXAL~6lTRCl*4f!S5 z^y-hRgx@Wkd|0O9Q9XVA3^;2610E#`#;7hr%pjQFs3<EdtEl)wi}LmL_4aO7V*p(? 
zI5l+|U?d)u&<k7JVl@URY%uw81`YsIO;|BVdzk)$Sw)_O<Ov;tt?gfct#A$pQW%hN z57=#sM?qW7_gDv+8`K&=sX#I205so7d+9>p^$~ac{Urs%I_1Xt0E#|r+E|J1n<atB zrEn5fXD6r7s3=Q0nOHhq4UPMWJsIF<gTfLYPlT8Na1R1IoZA=!B_u4n_5JfVYFqec zP+1gspVW?F;NVbky}+ny3MwtNmy_#+s@oMo0cXg7NhTy{85(|+%ef0mW7XInU?4!q z=K&$-o3;P?6{-7If-RA|<qmLIATW?ANPu=gea#T_X|$W<g0WVDBKX{F+=m9%T|r{# zS`zg>b%arx(q;eGulIz7Q)%CUKb8xpwAX#XNCf-M$g|sh-WO-c(a)Y5P~Rg6$R*G) zdf1y(hurmLZcvtiq@W-d%-4Di3zn?@;i>eICJ70ch}R=;X*Q|I-+ga?|K8+s-|Z^E z{`m|`o?^L&^UB^SkH6HtHRo3X5>6%oEjc~8>v3E|nDrutaGF@xda(AAUKh1+gbC~e z*eFFL7|72UiAJ3VY-sO^*?b!jWI?vuq?El6W*7o+n;LgB?5viJ@s*WMxejb}e>T}e zK)oM@(**XA93Vjr0z$FCBcbd9PojwN7lcK3V0!v<7|%df!bU<aZf$J*#O{dhS$Tto zV;Wq)Ggb2!KpYE+EqFEnN*-;Dg2x4cusAg8L#XG=tS(UEf-6=G_Qa^(aRJ6S2-<u1 z7Ft#aAaxDfe5uIczo)?r`u6RcRH-&N%)nnpaKpvmD~ZY<e1P`G#^5YWGO2*g)+?=r zQ!+5j^oF4+sTwaToY%Svtf_I+(<84sA|DpqgSil})tMOs1Uw<MNg$?RcGR7&g_AWP zHjIsF7YvV5+dv;rfymI&!9ZvfRf86zJL7N7Y<B#_#G;Kum=R3s7G5SGLIHR19!aK| z>b18J^h6BU)PRZymlR6hw*9l=7p3(^?zYn)Ij@fHouF5@xapOZz0&PWO%0>3#}Kp+ zE^D-NtnF;yEP0w-=}Y^^KV#QrsSRc1MyAxX$+Qlm`#SNRMQ}kj0afWtP1i7At(y_k zA1x){%74Go)Tm9{n=Lw5B}e<Ot6oZevAG74e^F6Gc4G=1lGX41QyAG7+~Px*x$kr} zO^M=P^4>K;(R#ETNg<XjxL8wSm`GvJNE)DfSa#CaW5g~r3ec+yLDK|Ps~1_P*Gng7 zaCvoANLUzbnzpvKG&C700byjpqgRx7P-Bvl1z|n`><ygJ_vz@~PAVn!S&V;dP#;G1 zft-W}(4Wk`1~l)-j~|EuNG1y^WdsnQOVCxoP6kEX-25Q|PM0wyC~jRB3r6Hy^QHD( zgn^W*Q52tV9U=ErSNGLwZ!+KuP~(H%zV%*Qt7L=!wqU=Y(F5eN?QJ-L1a;fU?@}vI z1BXV;8^l_WPeCZ~gSrLPBP~50AlC$R4LI2WJ?drztg8#I4zT`=8-Rk<)qnH#86pV! 
zr>?Fp+&K9I3W7*>G_7~R5e~`-LpcI@R8Cfwli~shX*B>}@RyuXB7gZFN(efwFgR6v zdzC0CC}4FUe(=(4k2=(tjC*sfpRs@39L_4uscKLEJHAPoPBE>AO(p8|W9CQldT7-4 zJO!skSZZ-cris}zwlr#rqtoMvV($N9>dOPEY`d=y2^mTyNs5pRWy(y(2q8&C=8##D zB(suI#>_*S%t@wXP6(k4nUav1kSR02b)NT+?>m3IPj5NLeeV0ZuD$o#Ypp$+BOv%O zr49GQ&rGhpKw)<|ISrM}hb*^shG)Nw@sT5QWVQ#Ihvrn1c2i)j;bLA_)z|14pL%8U z_n(7=Ytu)UB*|?i=gKz);-}9vrWKc~rRdaWk@zZo>{dNJx+U;For_nP<Mp!d;O=0h zHl9avtfp%BbDqaVBJ;Y4YWTGBCP>dfctFnlgg>DX*^7h=;nvE+LL9xtyLWYUbjrk< zj5BmPx0nT|!ay+C1fd^IOAv&R6cV1dTH6{ti<6%P)-hNei1eb=c|`3$k;@E9PVfuT z)Z>VtHE_hz4<8IGz5Bj?bwsX%%$5bT7d%xQEs3C$85_%>GDX(22zn_e=jU(VoNztP z#fkdmHNSr03ecw|!b{ldU^tuQ_n=&aT*Ct)X=>bN(*nxjEqauS0HP4MOv>&E-uzty zA;;+<6{nvch!kw3OCW_a6Hpl682J(fGz7vQw2F@tTCld_m1qcrx;X&F(DFz~OB2r& z$Vh6<S65a9>^cw*I-6#;p?U=J=&>}Z6fun28F57n#5JoJu=cN}Hj)vfrKM3Z=Ob~3 zj={=|3<UEZPXj8*c@Fe+cdq?xes|`oZcKC}pF#N&*fHJA_J1FS(<}WdbM|gqT-~4g zl`rdgYEvRF9i`&Tuh=4s%~;h-JSn$kw-l$380qjZ5ek^vzBetdq^!fTq_N>kqM*~x z!0PbEj?KU)_FWksQHe8sKGv><>U?QCNvrPLxJ)xAc1?E|c~yTw)#QF&EHZ{6O9yHC zza?s6Q9&S>JSh~kEW3S;B;Cxvudt*x#q)=^PnRwZJAbfU>WZK|gvKGlab`KmPizi+ zJJ7U{TRHWW5CJYxhaUp7gLnv)Vt(0BGZy*`>l+wgGkIXY;DCm1aus*XZqh|W>eWs0 zM2IMnBj9jH2oPeaMv?_mOTC2<1w|nAX8_9&GBBXwO+<~W1?<s1N2Z69-|)SC+*!78 zwG^P>1$lW$B6P8CDz~>hj)uB{@j9yA+1UwnQ9xMu58@9lbT_lOmcD*5kv{xN*=zDd z{%%tf#lU}E6YQGR+nJum?SI`p{lD#C@=cXDhIPcFp;(nCL*gyMI*q>^3V7+w#COy0 z><~gX?8QTv3uu@3zI&AF*T`;b%Spdeh`V8sj*NP^<F%HGDI=O)5ByZr(=WqJZLsvT zAPv29i?xbOvT3e7A3Mnon(F&(iLAerQaU0Oj>%V_>u&tJ5dXl0f-C4tef_(_!t9=_ zD=s=0QU`sEWYhN!pZ7O=PP7}?W`1E)@-I`kNx4*LVmt9w*wd!qcy=a{jEmdfYDJj# z3HuH`*V(!)l;6xx&{5qQvtXtujMvE``8A?fGO9jmVrN3O`tSyGPWcPTd6VZago%D% z|DPw=>?*&M!UlVK`&Pf)H>UZ9=eJgBg+xW)mM00#`=JRPCpeNE@7t(rS1`W5ncQZ5 z%hS<_<}Tf%vrR__>eF<={QPq)#{}Ba+<54N4GlL6n?JA50%@AGdKCPb;pG>efEj9& z_Y3Uo-0A5NR}(5LCz{g+Mn`o|+Z<VajfM_*P)kR4LazAs0m8R+%RqI}BYA>ch7OVJ z!zl24_ZMet3zhJajR>CI>sePXjP(z`d$h5STh;bRb}+Q5*yI6$fv-5DFJyx(#mDzn z=8poz!7<PYyJP8H`*%ZLrUvbdzMnsdS4d%*tAF;qTVoo3o&e1aW)1n3!%!^}_VVX% 
zCS18OHB50oFfg#RbUpKO*5vayIX$Sdz&i!3v3P_M7g>Ou7r&8~S^S+<fXOgFc(_DD z1{xc0^Y``M5fNh8S5)0DZlBn!;94VFYqgh|!JA*GFD$fhDerB*Ge|`@u$B?5y7V8J zBp~XZ0iynFV^l7y&2trv$fHM&l%oPeOW&(b`vQd6Z%it;Pz)ix${s3Q6)~9q+iX!x zZrA^*s_iT#p}_dyhtC2ZZ?ICf*eq4P=lZvOdigQ$feYrRS5G<{^1B^P(>%CGRYL<w zunN*Rq7FWvfoZ-U>@Bd(g*Z8h4KbhvM45U@bxsZ74D1(EGqb&njN=0XAI)=HroJM; zJq!-^n)@Aq`+|oAV(8*N_^kEh+N7|(`1qwsu72#hYa{oAz2aX-pusw-aj7H1IiMi& zxInc&^YbN%?}6l6>B35uLi&ZDx_Sz(o|iIYq3(^0iaK@W9a6rkuC5}SAWbv8^_7Uy z|A0KA7Fk<eMW^*&`KjSWN?cliS_*Nqqh^$suR2&sy&AaxaV@y}6CGJGSy@@$?BTqH zs&ST1=Q9teIrMpQ?(!dSmN4WCO`VvZ&x_&T&&K|Yp;E&v{IXMjnL*z1?v7Uh2H9Kd z)Luf2zP@#~_Q#yv1ol%emT_Lcdv}cpvQ&BpE8A_K!zx71fdKJ#&t^a&(VfF2=gDFh z8c^Zk-@xaiWOj4D4Q1E)riJeM`s>THeI$vS%P}y*z?!C<cU3)q{_&G1o@jR6y(_77 zF(f1e$%<dLfr!HodI9xUtp)Zy^9Yv!j@Lry<tOQhZyEvTBk1A7J9v|tT9EQ98*slH zqfcQqPfbj${*nm?T`eg$cMgGc=>U)aIQ5T#UGf#`?(Xgg=6m<<Mec>Z6ME>V;?T92 zo1gzZcgJTX3>h-S-fU>tg(xb8tr6gKFx+_YKf2?Qqeq`bMshrDz{3SOvA4ixXljag z-)8eCD{!aLn@0GCmcq2}>|@p?D&j@D^PW0IBtATR2o@!lh=8Er+`oUsZu!#E=DACx zoo|hh9r*w!xNs1DB-i0UQa_fKmlvVx2hI4&{|J1Gi;MOMAIZtd#l@?YA50Al3;;bK zqM#<+ZhfT`_KozvUlpd7g7-WDK=Jr7@5z(!J-{<jr?UT^-s)3!2XBEw1uhZm7Zvc9 zy-aHNSeIf9`7Wrd6W<aXe2R+;MiW>ptH3NbHa8vY?PYYip(z24FacV;x)N4@&k)p& z@{<Qauue>5MlK8r=)dW!q6w&Ke&8^G?*!Nhp~i-2B!jx;y|p@q+wZgZn7#fwQ7_&q zi&o=g2QTN&(c$3%@tN^)MmoBdJj?UI4*y)@`z`@pMv^8PMim$I$oxGSh%QUx6iOF? 
zje#r))wRBr6{ki!EF;jqNAt=avCkuaNH!chKj6WGt+lBfS1ktr?=D353RfNwR(W_P z!3UU_n1EmkTnH54fEgG6{vEh6c`M2`=q30rvY;MgOLtlpQ)<h;fc2NBGm`2iQbGJy z+rknAM63`L<Np>H0X?k+*&<uRf`{P+<qqQFcnU2*dbu(RqNto4C*VOK5`nH;@#-G2 z^ELegC0&-IqvKl0PWj25(9RFP!Uwx=ZMXw00#{F+^R$@Qb?xUxMQ)otLy>~WnxWi9 z;$;XMOBF9art^D;Wi#HqdGhGdE+hje@;g-kL&r<F5b1r3-E?bRz^tByha-J^R4Jp8 ztqV@C1}ohn@=ixmMY=Nzg^FaWb?E-&FDQb7d-G;4k-)}e14&Mk1TuT%9RP<~ozg4Q zba-%#0|4(t1EEs<6ep(@I@&^U>Myj&cJ2f*P!rs7{V+=HWh9SbdtRchq-Wf}|B{v# zD?Jr11|U{15Oh)d?!V<vu|P#o>-+d@hlSCVpItvV`YqW6eBF(exk?#_fscN`{lRH_ zlbPv!`?iexyu}&yY0CM%FFtFNtzME$f}I9Ykb|5VH6!zGpzscmg8)*`D=gg1KL-WV z>bxbWABu|AB~Sh-a&U4Q7dszkWBXk`1mZ2en|KI{7+(4ywCwkv(EP4LVG|FHM%IGq z-q)JM6KWfVE2wVp!&q3@Y1GvMSi1lt$an4p!+&E?5v3To%V*hIYabsBP%uMx6U1jC z2M<R?$IZV!c90NZ$8T@GJqQ>Y;M~=Brd7}7@BL4pQ9t|%FeQ?6qLc}3h41IeD7D>) zemw+V@T2PgdleK2M{rC}Pop1-yc`G0CFZ*AwOlMBV)q|)6cKX+uXfZFM1sm*yNUK5 z(2gIN;Tvsi$~xf4Yl;;#D!D}z%OWDg-6QT1=#qdF0JWS~Js6~jBMk8MR8(AC!)qdq zI9u1w!C`uKmbmPZ?`^&kX+tw25NQSI2*i#EqoBXU6=UPAbr~gWAD$nGC?kzA+kZ{a zp94D+#AqZgdHMNZFoZBm<e@{2!2TY4){$L_ZWHb`t0iS#>!Om9l0riB$WftY#D~Lt z=CFjb1ximc(yGc*&Ie{qL9bAPj(6p-w5Mk3W=&m&VTt<!C<((jzXP||pn8Xv2;>Wg zzLKRb6Q3jCp`q@8GJ_GwW^^>?zDJXnZO?^WAi?)*)YH<U+rNKdapDr^PJ=5~R@T;t z9eb$ZU!9~|0BHh}8yFkla-jbCQCq8&)}Gt*2hFCSms{_%3wQs-ZN=FPdV|kx|G>a= zckX<EN)t^3RXAob*P$)xg{^>wMSLni;vXoDBxw%GPcp?@`+3FzfklqS&c+5I0ydP5 zt?ieSbVEZ!*UZi5Z#G6Wj-YC`(2Y-}pk=x>^qH1$1vo2G3R%aS+mZh%MM3@;k&V}2 zIdA~yD!^8I8=E+&X|Wv8ki2k^!+#uB74<Qo?y&Lkg{b-+IB=lp&kDMjh1RVvL(UaK z=c$&WO?X#Sw4Fbf?rH^+jg5^B&hAzxMoR5(Sni&`(1F2%1~A8T@SvWywz;Y4z~EqO z=4D=4S=sM>I|(@^m1u_yfCKm@JA3H}wf}cJM5APuV#8<HzzCRl+m$QzV++U;W9e58 z<_}eB5|89x<?HD4Bk}Y9?q<zJJMXTQp##ka-XI7SV=*$FZyg<lxPjn*A+WE7ynQC= zz45`2kBAWx9fH0QGLNt=9aN3LL4<d;wzgs|9jE-%(7?^g`rNWH8q^cKs{;HE{KF=y z=FQDb;stt=V>eA61YoVL9osVpZXn|W2}wzEGUCg#o>WjJnEO&H_yah(CZ*`eR;8bR z)_!fQa^XTKk_~JWBpa#`s7U`oKr8AHUs@^^Sl#M&D>)g^4IEld3JdGt46eUUJm$-h zFFg8Rj$&(@sr)MU{ri6l3)BcE2n%y})<R}ZztYXZNtK_Mu9*h7w-0ya!R{lOvmfY& 
z^ZjfCRg%7Z`xY7$WX|v~yupH~==xQ*ilhe7qGcv;TM0)q`uT74^9iQNVQ@Giz|_n7 z+XmL3&;vdPu@*{878VnD*r6dXH3fKo7)uQB-t2e<o!iV$V%!EOjK&+7%n&gQ4iCSc zeNH#*Li`L__NNUp7D*TW<7_QpYvF9t(b0jw$I8k|Q@ltcz@F~z54(SF{6KwKY$yW> zA76}=(=X{r>b=9sumD@7_&3vcK4Lhz`P$RQ5$N@h-QfuX)kdjLB$G(H=T%l#rleRN z30y1WK6PrRFCsK$fP$ObGKenV%%BA5?d~3z5+*(gEXi~LVf+RK<46$-_hjm1;!3~^ z!X*LBgpdI3Qe<b?Dpc82x-qfnDLNN>qFy1OBTt9UY&m%VYx4fc-QktzE`G<&Mz(Ps zHV42wpp&rjW3<Y;0?y6oXeU-SqLQSz__$Pejt;PxW(R)$9Q<;~T@^#;@POgZJnRI% zL+}x)e~8vZb|leHD?UPD$MpC>OYXJ6=bhO}iA!bIc90Tkm6QmkrmW9oowk0u!!knn z(oXrULy^0uxR0n#NRfRMl$MwnLBq(HlSAkn&Pn}~|NbSFLUMQKr(6CIHxt)nyK*Dj z<gB^y(qPm}E*`Wq73(y?6JE9*&8SPCT;Q$67P1!iqUDRZ>ZEs9n<pxH<~QSmO>gJZ z@zxhB%RDc|N9r4Mw_7i59)D~m-tv9=bm_BzUpilS;I8HJGetF@`}`hC+NR+Sp^q|N z%)$h6X;CwqPfjzoW_FiNIIS!W>ei-2)1xtK5VT9%M1ZE<_Iu8LlRIng>30P<yBB)P zwBbGWq+$#%(nmBBBswR^pZf3u$^hF76shpWk+qPNql3=HIM)Aw1cpO%=}4m#mIMfY zICrQ5ez!;Yo5ex}4W0+icxy|`r5RSWz@@XskN!AYKTr#lxRPKH7vyfZsrS={3C<{- z9XpHTGIAmyZK}`=x}bK)Y^d8}Iea12ib3*}zL=Q~Z-=|mWr>|5w{<^N9Z7U&_L7mH ziu%_!BqotU)l1IV8LiAFL(y)+x8V~j$Euap0WaptDz1)_m(KcDj)y{N`)}Q_wdQQQ zE=(Y30Xaj$gH}8dwM5pA3~%YZjLtIXLhx9_YI)mgjZ9_~bglk7(R@aCL0$z(3_b#d z_|fFItPvdT?d>QH_wC)AduQ%9Rw)Vb%iVPx$)K1pcI^DcN2`CbC46bbvWiemX`1iX zOdd}jz#K$P9kKs6;7+5ThJ`;QgZ9-4FpzYp8Clz>s&gi^eY?cb@=YOCyLfb9iDSo( zn}<z2_9myW`(>Wszr5*d_K{8Ch-zIqpNdNJ;l92OBymTF8ovEkZy@zTEI3X_xyYz` z#tkYgq#H!(6DsGmAfloc=P@2lvhd!D6Krhe|0|T%|Mfaul>A;9p;lXw^+>+rtK)4> zHId87CVQDrblaP{$fb~GKX@|~MCo_UaQBvg<asD*BtKHuO_dyd&7R?<x9>^t6UmjH z3=w+Oj($D0N-)L#{_TnzF}vyH&G@~kDJ&;OK!@itvL+-jf<9ZG<z#=|LHu3?$PF1x zN(}8;lq+ROkkJW~seIdi3Relf7Y#MF;)K?A$WOJ=d8O%uULGCS)_*JdFnVhI{O#v? z>j|s2&eFGIA~GS3kG(FfY8<?f9jy@csq*;n{!sJ0LY~zR#xr+!nPTOkWPoln!dPva z*`fO#(IN;blB*$~(nZ`D@La7BfhIp#f5zViC>*m@?8@J0n|w)i`7vj(4%6lytvA~s zHCbCv*zDgdx&NlORar73NaNJf^1Bx{-{sciBW*L4E=~D6Up6Z~s+_|`9|UbApMKu! 
zna5k6I1h<#5%@-~hBNqkl?aI2t{#cGN@O=fA0juQ_04!pCpLkRa^)mdVVM_Wc8I;T znH<S8Jp-edkx_<5YZ@n0&dy1#<6HN%=uiF5Z_=pA$^SA{u{b_%4b>wc^%DNIh1#Jx zlxiTRT&)`c1|6G_Fm?0#B^a{#`;$iQUV2lYZDY$J7Ztn*6dkic1(~9B$0_F815_`3 z8_kBsYu9ej)21qa+ShO*QSRS$gwd9}+1teQ2T1mjgPE<?u>0G6@OthQIFP3(o@tMu zPf#MPJ=i=UKtoB|H9lF8tMiKeQ3*@=!s@l$$+al+`jPn(mdLrCfWDG%ZLPC>64(qi zuM(h1slH1X`Q(!la(cho{q1<;XC-TYY1buQ+Aw8N?6+AW{eNC!a1?_r5tV6WsUIM? zvOOu)-}Y$i2-*KY`F`#$GkXQbL;Dqu&8=H9+Awu9D{$4|KlZ<uUDO3>*1so_G4sjO z)tor>@>#V2%?Zj;cPrz)eqp8|F2!f3$C6Y1O!q46B_WW>-}u0?Sk@~1u!bz=oIsxi z2p~0NkCj>$OOIcN-5B9XGbcXOsie(%fSPjm*sP`^$O&qJ$t(=ikn(CW%j9}hzYpf^ zaxkeJnX9hjiJC7r3ne{v^bkYI%_S;Db~e$qo(EBlyM7mK1e(F$Hvh`(FM=>Fnfyaq z?{$Bxit7?Teg2N9w0|7gr_j3XBeKK7(n2kZ!({r7x?dO_<;~|dGAv`%lrk(n3|khB z@t04EzHLqKf1UEQQ1|EPjk&}ovBP?A3C~k5NBnv6rgAga-;D8bfKlZg5fO=i#_G6! zv1P|TUVdK7TzDwqzfv9>Z*;Rxo#f);Icbr*HR*3C<2^oh=Ea^_>Xu37dcBED<+FgK zba=9h4`b<shtK(i5^3PbjJ$mkQzxKA1d$93!3-TJ4_|9%=w@w-@AZ-SAid$xnNZxW zJo~fY_t&MzUHr*$r(JdGlk8@_9^WkPo*IsU_x9k80gcJ5se>0T(hEeX&{VIGdT!3w z>U~XSpxQ++&7Ygki>UVSX-)4C54pzpd34}<XtJ7j+{YoW@r3)|>XOf$Gm-J|tE=AS z|J>Skb;IU@qT-5w&fL`HlbkQvUsg=_m%gf87&v<5XrG5E>*qG(6Td!qKB%s87V2+G z5-rKgORV+QqaJq76`SD3?;MzW)Dq~X%OAgsakI#SpOS24u3FZq_uc(()np3NgGx;w zj`bx!ctEN2LF<112hY3HZ9Qyz?Y6(tLXXQpIVWp(y6z_Pjz4Q7?HM^w#S?QD$EfF{ z)>ry3X=mI5jU45ZD_BKP6u7%f5dJ{nfSiZrP5?Z&wg;uP&_M<tY4i)w0OYjzUjQ`V zQEbQh$d#XKm)jv+@V<Oid`s2yXZ7`5w@(p2Tqk#7^aGhvdDa*SA+Y+uBU)=Fal^MN z_F4-!b|`Ml1Ybp412Z2EpCAxU3xAHJs!i>n*5OIX-Sv6!cPeLh?v7HnY99yF`+~tv zq$Fk*>WoZ#4f*6RejM3LdaP<_zw&1?6FGu!z%hpHjh?DxvBQVVq>m1W&ow@>J5st7 zuJ`mLgPFAowj9iYzn{hAf$C#4!CPr+YqfNgk6<i!;n11qxk-Ib+|M&7{-mOICh@Hv zc~nErb9{Pq(0%?3`~FKV*$*t9QCuSllT3nnv+fQr^4<Ne<0q9!Lj&hl@5|w{EA95y z3aE2&JbAK<{hEJsl^_i{&7oB`&b-%WzYbMR+l`U1&N_%b*r8O_-V*lp%-Y<}n%^D0 z-m&=w2FgJ-;xXspcEmp9=8%b9d=_WK=MfTlI^+!;yTDGgKFi2dZfAbr(Js-z=dcK) zvlk^Hh-l=&W_6{Wnxx($4}2RiRl(-dqK6?u^`EESh@X2ZAF6PyZOYUVK3e62(?N6* zj*o{J`V-n>fV}|q6gu{;m7lI$$v91~8dPJhsQC2LhHbF&2|7w8wtvl8Vze~D{y&QN 
zn)s!B=6}@B^-mNS@Wq@PKrg-<htvSQl8gIUGw+eL5tlRNSLqpsbsJSGe8u0bw+vCZ z-IDBUE520bxw(0@NY-OR?xNYR;KsCkA??qa@zZ@jN4hBXcJU<&i;A!aIhW<T44tTa zz#Hl8LY=jcwtM-IKvScbY3G}r7cVGMFSwitiSK@YZF#1g;%CL}7aD<lm=L3K{L4Mr z*=COf5zi+2WQL=06Yj7DfA82Ou`)38`Tf6Y<qOYMR?Mv)VM52ncQ-peMav%vW&Wv1 z+H$E*mq+ded7)bEptHN@$y3MKxwqWEc`M$LJ}FzYQq!93$`qw}E-wBmrBe6Pb9<=* z%C3ff70c;E@9a{2mJe;Q$gSa;)MpwuXx2rByGq6Qb}79K@7Hj-Vawc|`dCv;_P9h< z_lL_mXVbC94I?o;Bj!^0JdasZO&t7ozFxkr=hl_#m=;35USzvG+~rXlu-dN?=U{Go zf2cdSu}gua&*i2|m#wh!=TuJ_i`V66+;8#Azx(X7{mOF3)6-Qa&XmQAEm=FSb!Uu{ z_}%{ZkN8Q2_Sfv3ayniX-VW!lIj0#2co>8}w9x<)5_|zW3e}-?`s3BPJ;>CF)#6N_ z2ij@$N`PNlTUy?gl-vbY1C9oK8?L-F^}|}xOaoPjAQv%=MgY2ZI6!d@T(`37L|LF5 z%p@Tt<@BQ<11$`cfdpTCW(aD<p4Jt#UjaQrECDdc#l?ks&mQnOoPk&n&`~jk6ECnZ ztiU7zGtz|Stq(pPNoEf$>4(dYbQU>Sx*t*v_j8auD(*fSV|195w0CLd$fEEW%c04g z;ES@W9(h3Wz)$&=)#qkYMozj<ZSIW@M-pS+_6S<ti19BgdaD6mcTB_U*P439x-WHk zCG0~K_+X-OIpPZx=Pw=}bY5<)aiSo5Z<CzJcyR6Szs)qk+|~;|J>{(}^=G;E0kIkQ zokU5R7Da_N#7TbMD!~#pHvLM^kXH*DQ=doMD-K7_^W=sXP7hz)bd+^B*4MAFn6WtR z!mG&M{zfbZ4ci~CwkLA76go*$Y?$hDKZ+i?z`QE)QMR$ZyTEn*Ykx?Rx#L$eV@f~5 zGg<0w_o+$mdx6lo`ba&$C&$I`n3nC+(cMfkoxT(?-;VO~o?6BCk`m)P<9_EHhiJ`> zlhdyA9TdxM4!??Rmi&D`zPC(Cfu&>FE&fkc#V_tXwKLyvad}LBTxsWyz~F}FuQdXc zUU%xcyM@!eD59ut+Z)PEmfFaiir>hy8wv?-beDedI`2tpdT?*&o2!LOW7Bq53+xa4 zn<{_lFMU`l@XeFoW?sE<vGID1nc5L1&x*Z=2EJV-r#*bl;o9lzll7MqmgDvu&bR6C z->l1#EEf0lN;*(~aEe_l8Zhej#ztNNfp`C-(tdCrp>pv=Yagg_Q!v&61SA05;KVm| zs{|NtviQwU9UV(pU?3ChAb@~B_z+Eb5SVYk5e7gmcp{fBQORS-&e-TEsjp_56sjMT za&bQKFd=}skq*xpV8`g*psfK~_S5L-RNX9qmPv7OJwW8ZR7?WUGqg46`h<f!KGkEH zrNq_gZ}gd}OICbZJl2H<me<BqcBG5&u(My|J4kUSMmlarjxaFC`sd>JhH4Mr)UKS^ zWziF-c^7DvK7DBD-*n!{j0@-}nWl(&Ycx67@uw*zpTuu^jfUBKaKmQmv#6Bln`a_E zwL!GKO-<P}w84Ue7u}dqBH=YNf8H&bJ8B-oD~x(Yzi!wBWGA&}@OC{2VKTgE%tJ@n zeCqJe;+y<3qLY*NVm&TY2S3r6`SroZ-aJ#A=S+EgU|rqH8U<(NllCe902T;uZ4Z=N zcZ{C$@K0D>+k4WpIilyWcjA2#x~3bPG&*X<k1)L881shv^=nrH!>(U+Q)Ka6n7$ML z(gyC_c8+Gd2&5J7-W1&Ft=KQiQlWKGci-X7N$kC%Sy`{`!SZGg|18VPaPJ>cSnz#b 
z(Di89$;8pxq21?i{mV}qdD=^xy?P#t>z_Z;>2Y&2R0)X+Ui);3hu*FFWkqs1*Kvts zyyxzYuK0OX={f!^cHQ{2Nlu%+Q=xUss8>UynMGMCx%AMp#J|5aY`rh-af&P8nVi^9 z;Ns>{c@ipex(G8YQgo*l+?B&Zr`XfLEItg3eHb=bhv0~)#l<Xa-2$IZ5;$1`Ej49K z)y{|)bMnTy1AL8W5s38!{T4rLu&{*Ss&q^8h=4~R6&djI7cX8QYk>?xSU><Emb(Tw zTlmB3K^<LPQo`ffANcv)Yjsdp0RF_kE`U5m;`{jVd28!&{Jr#<GZ4U`0JMZ-1T4jN z2M&#or^jrbl4YRTk)my8)?*6S%`<DR6Ti|*-CX!T3c%sKmHwY2ootM$pIXZB4R<n3 zkg&+_vEOZ7_|CSKhW#3tdnXQ#an2a?KRdVYWLTqowyrthz=)-@gtc&_`Ytkgm6XfQ zOhHtQJYUWyZ`}3y;r0rGbe7T^U2=qCiMX$CHO_^w9M8(MV5o^`lf#&nbY5Oz+XL1j zOhKvc(p!(m_MRgBAzpIDOdMk(VxkySd3aYmt2QhT`;lv>Rad-uk+$7`td2?W_R46V z6klJ--HqRy7!Wi(tYQ;yl`kPq?i`R<J=jK=;8yKhr?JYH&%TE!OOV@<-oe8cdVz%g zz^LnFXOW$rPlnE?kvZezpA)W%j!Rh+cKwx{3qanKn~|%$L*|r0`JJslKOf!mqVVed z-hP;XrEQ=S)_Z+`GK%V!WVUA32sN47ah=Pm$d0~OX}=EBbGh^LPm}t!{rQ+=6m`Dp zd($TnqEfUuPt+_i2}D+tK-sUzJ`@xrbnal$pC^mwmYtUVh`la*!?7{vwQMEB$yp}( z>l(GZOh%||<mZncO(BYK8Eq<dU$8<y5h0J30G3tVkl;0qm{V8Dp{jgRYhL^LGpr)R zZJ0p(ymk9FIdQL;n!+UrMs)w<tR(?t{u(RzDwR?+L4)qT;g+M~JrK~`;CV68mU6Yo z5ekBIkYR0p)Ym^;?Si*sOmuVw)cgp7sGpI20ac))q1hPsxntC3E5i7Io8CE&KGPD~ z*qg<ybU|OvKMp{dCt&oRDj+e=0Io~~-@1E{Hl(FhNZy!VTfHg<fTyMST;+*lZjn?} z)xNA(7(;Bn+j5GSF}{;9EYwDhJm}2B+XxPE`PTTvgQfG=Ua=oN6LOh5>Y0qUG@n3W zoNXNajsfQn1~BCBAdm|aR3n5n^bJJ5cqF7;^m`ql<{mZh<F&qxYPhAMo!w!NC?-O6 zCP&m=FG*RU@K|x5$@(t=)P)8g1;TibojKl9-(B-DOwZ_)hn`uQb;S2U#@JYWY4014 zU;0)fIQK9WQpGwP;Z~zyjcN2WD0odmnHZ+mNJc7ebMvCPr*m6Yk%!xx)A{d)o#kHn zr5-ibkDEAN8&JNs=fEQgD=UWHPX3!tqf>uuUtQcm2;<mAPzzwFyUEWYyAwP1ZG;L< z@RP?>3%`@=VwvRh^=~S<bBCBAhj!97nIp7aim+_URPGE;_}~$Erea%EYG_^mak~kU zDUWi8x*1`Dfz!d&2+yWl=~`UGX4UC)Y{*k!mBz|SAi(8Uqa7_E&`2}l=I)fC<pTB+ zI%uE_CqZrtbt9-1O|Akor$t0gojjQ+={gBQ5?JXlPfU!C{%jb>N(lyMW~Ob>FPuZ@ z38Dc*^d%yEyqTy93ZIaq<d2>n{gVvPibA(@2gw+RDp*mfDcW8&mF0Y9jA0zyInf=U z2*uZuZtFVineO}H)*^O(R6Se;rkuk<+%3W+tWVXf<41D`z7(6{NACS7jL=cg7b%$i zd77X91TXAYKrJ2V5aJ9KN=c8T3O3M)Ftu_NKuUldB361bfiI~<gR!-=;Prjey=no8 zM0RTA%k)1@w?xSel9Z3-d=_{DM$`~SIYepqTyHTgRPHNPG@bp|3~#@Z<;gd?8Ie>H 
zs<sl$0-}N%7c{nBW(E?hMx!Zwezg@{Yw13wSn~c~f4SunYM+zM;s-`jgf)f;q@Y_} zVWL%EGy9sKR(ad5@4a5#%$O$go6HC;0;wON?k1^RT1Tc1)xM8SpE6&kzY7mG3ia-% zndrEyGRudLA7!Au!F<aAmIUFCGUA?|zl221UR2oLKH=$7>h;QF!nE|X_M(sZF^@uX z;6I<Mc7iS~ljZC3_wREjdX^Jf?@8{Nk-aheQPbTiKP!_1l)N&RA@k6;<;r^|f0ln^ zJuI~!Zig7Nb0UV*PrjKW;!EP6C(m~$btF$wjC&FH@$TJLj$>nEF#mfon)WY{$SXw` zR%W;fWHxAl!i$rlqu=AiNxe+dS{mAOt$Jt<Oe3^bCTC_eFCyx60)Pg}ta4EY$@LXj ztkkyI7TOjv5zxqjjE1Oy00g(TzAgsz8uDG-7y><<M=qiQ2MI3;85hDM82g|Aq9GIK zkF^N(tfmzVB!6~wb$$BepJ75p=8xtn1~vc(xpb)=Y$NoR@$RMZ*6NQRSz$N=1<_ic z@l>T&mi0ls<+ugCzjr(%JeL;?jWvRkB?GGmQ&Y>BB%S?L?4NhEC3!459pcE=ZaGp4 zHUn*#PCI*L0?)p~={mxy+EfyqO?9(><9gwK!~;g_f{$TAXHRy9fkwM<wstG>)6s<h zp5xiAZKp^1Z#p}yjc;jCH!7bV%dzd_^Q|6KO}UJTIfL)U_zc=>`m)qCrApnKj6cNJ z4e=h(seTabc2wNX!raQDN%ro{*Kn`iRZ@5$xsb5OfAG*)zq#vy8P(1+x4v_@7wTlJ zIZqy%x6QQ6F&6Xoy1Dr~<M&JduYGI(jtGA$%usYc(3f&<TQ@t4L#X8Hth{&j54Egx zly_hIXk^_jax~L==NE3uB%A0fT}Uct6j0}RRj2XH#=>0p)GK(&Y<#c|PO+>myy|vq zyshzb=j2)nF9uHDAXSO@edFQkd9fSC(%B)YHgC>{dh?{bRFVtRmcPUiFtn0-rjAM2 z{>jivVt^f!$or@-)+XDX?cRp!5sNQ7t}-Vk)SP_~oFILkLzM0@t#o<Po@XUDC)EE& z#!-J(Pvpyf+0=AObat2HOus?az12ohE_x&@`%~trZMcolZi1-!+!Mwst9-z!r6~CU z+!jGV&0+)}v9_9;VW$hPY}Q~`!A5BgsDOZ<2!Psf5q%>h?At<jhlf^IJf1@@C=C@- zt-0%XOBEV_5@35+TUn74&<UMxc>zO~FJHexN;2J<eV70%6?m^<DP03Q0{#@`Snl95 z#Vvq|yO)J!_Rk;mr?-FP^Fv1lHj&-!3w%60Fx-%JoupsjtUwPURCY`J+GiS>WLZd{ z!6qQU3<}0eo@mm+f<f~3pOu^2lRncceO;fGKO6Ju+D-?CWLYp64b90LYjl)(HU$|f z>@{U<5!Rxwf6alZmEt;jPd$O8Xq{qT_4H4b&*qGC=W6<av`8a@KI9tP-nUx5n&*TL z?xoxnPoCmmL%2bDkW$d9N+g?nc;4+`Ort8zj__EmEct9U31<JgK-)vA?Y_ieUs;hD z+9iFoYG?<6i~jk=r|KI`Hl8GY80NX)cgOIz?v>2IV6OVCR@uAnw;#V-DeH0%I+|^y z6Tjzf3Z`dP?}XD^OS*gtn+=n#@!s7rrnzz%tR%kttRI@<=*LyU^qxu|mEVyb`C3Jt zI53L?mnsMbI0o5#_chYrDgR5ex=#IBzV)-$5f-WBZNl|gsvmuu*=~^DGAK!X`)zDL z6D`68!+y2n*20*c^>>x^X}EX@wGv~<4vifx&$WDY&r>OpCD01f&QH3{VxsI74mJU1 znH!7(F^Xj>ry|ujUX_Tg>U~?Q#DIeLU;u471Z%v|f~^`T3s#nvJ>A_KL`&`MjRj=G zD7O|ugU~v4hCnCgr2c$Y8hZP{0MIT4cOH65kjr7MyM=Tdc6@8+YP}>u2?5EUXe%+5 
z*Mg}!#*odz8r4;Q+{50%LDrj=TR`CN@)?(=CzOQo_Ov}N5bEF+V$()6q=vzZg~c%H zMv~|F5RpB9pe$)tw9SWWj@L4b8Hw(!vg6-v3&X=(e-^vgCA05)Q~}qB?fJBh{fy+2 zNA$C@7yINRI8rWeyPQp7BO%nu8@VWId@B9Qi*d-QaZVOAJK|F<^ZK_AR6ie0CiuQg z|8cz2FmbiZwx@Kei|n)EyHP4z5&hFtxwkhM$$3V$?=M0Y(6D^_V)kaxc)@GU^_d)9 zZq=xHAI)pVoXivf8%28O((ZLN@L#{M+S$^Qp8gJ1i}%9xm4J;t#n_D+Lp-pcPqxm- z_S3j1>P|&PG{90WC}Q)y_xI8)!I93N0S5fJ3UdKXfo344=<>x}V~kEpf{pzC!-rwy z4+F_qpHtl2`o+$c@H9h6!p&X=zYxu6wAF}CHb^=_OF%aVA!%`LZgB1n3L}^>7=i{_ zVACF(da}@<9Ap|8rWV<D*n-gxf;Dg;bRn~K57N^=Jo^$hgnY!o7R~fAPoL(S*9JvK zMgn{MnkcaV)CbfDO-)Ud^+e$@d<h5xpq`+u_tNGr(Vc`o549SUXhGG3C(&WT2|;*= z8V&Lf!HJ&6#wk30XvW^W`Gdy(eol2pQTtfqXOn9ezGYnpkpCIfg{rC-AZr34fCc%c zTURkyOfW06o%t<pyk1I>uOQfpSdI-yPXw+)+7vJB@N4zzAOpjw`ftxYr}w4Jux~IU ztw)-*y1sq_gxJwhI7?t@avncU0<4?JAtuVp;AjCN5)_blfs^YEMjFyQ6rblcHDTy; z6$~u`_!wYEv<e^j&%^dL{!Ai=j4tHpnIEvK0M8EVc^7fm6_IoZ4gg9`j1Y>b9)#=H z0EkG0zbF+z>H-V`Fk~k+13kDB2M)j-V?B_?8{X~)<#%17;Kiv;+$QhL6pw}?@Jka` zCeo5)JB<z>PJkviOD7ZCGaJ_kJ9U8fc)|p_0|!LlW`hl3Cmc`YjLbH?on8FtBMP}W zY7p2he1M{KeSLds3ibeXxTT|xQ2B1TT7f6}=zA-v-;Krlxnv~5v}!xKqPCw099mU` z_xJL`LjL>r-dh{RnrSZR4q#*wn6M<K2rrBeYHMl+#Jyq0eTn#LM++L*4Jpyp2LB8U zrVrv&5?|dcT+R%V6vD?(pV-938LQu6-@i^xjgO0iF}1JFG3s5rCa_&yU6&x~0S&1K zx)iilkBHxZqrBr!IOngg%w3C9zdg}*98?mk+8?OF@PAT(?%?hlz}LxtCpDJ4D@rst zOhC2(xkf*R1YoQKvLQCMm+-iU8X1{8CMCVi%rs|Ez~~q_I^io`IDg(_Z82l(*`GDd zZxdQ48M@vWE|`3O7DY*b3!!=!yT7NPz@N<r8W;)19>D>o6w$o<`gJjw=pZ|T%~Be; z&EU1*^}jA~%}v7>S^_RfQEDkMF?#6a<`|WJ9$O&!2nyDJP85D}AYfM+cJ}s4ad2SS zpCEdqRXn_}@Ir+HqzXPhq7fiOOAQ}TPZs9qgWv-1GVFx^<aOxx(qW#fCK@sLcsxXC z76oZ|`S_w6ERaN+tAW{KraNHHvk%7s>_cC2Wy0taJ!PzX>{6?>P+=z8(`wa#NWdlA zb7ArNZzrbnU=6wBQI7rtVGvyd@M{XpYpJZ5_Fsd12Hzp8_^Or`oEqp39a=qB3zwIG zIFaol_=($nfn6Yu^0#qrej7WjD$C<bMLcGYEon{bQPj;<_I)K?4aHSJKU@Y@+QtUF zG|;TY;0#+!OSr6oj|U?Y+*N{Hinc081iE#&O;!r|85!^#4$w@4NggOApz0twp17%# z)d8jE9EeMci=H5?E5B;BErh%YB6JXC_dTj<Z_h`Xj6VbH0qG&Xs0g<XM-&P*U_XTC zT3onYpkKe%=02_Stl8>x*D*>qX=!GHg^<3o@&izSAa{g>AAM@QH&@K?P;G4ujEwrQ 
zG>LkBn|3Rl7lVU?Y~d=eTF2+yw?1&}Q+0I}A%N{JxIm9goI9v(;uek$*y-}bWwT=r z(DdpZpD8C9C#-wSIO8LLpY<ofsRB-XJW;l|NKnHpH7ViBUT7P2<)qaN@P&K&Y3Tc# zvE(g981nhRUN4aBE)hQ9)k7R6k%O%jsEx6m?!){o!gOT2g$Kkn+kN+wxm@kPd8|@# zhddo4;lZFVZPR34vbV=*j?wMgq7Tdf)-yahi^>6Hbd<P+fyqhCPO9CtpA!_+AKl$J zAI@_`AWmQ!iPd5<sJ3{)=RPj(kkUmcD%rxVQJ}&$3TZ#q4(`XF{z}=9Aq?#T_qw66 z5jz>N0KWsr8h#o69$;A^=<q5WdGFs>IFEe6X^$}g*PNXCp*Afx^q&490&7bv^4CNq zyuu4DSe3X!vW%FI!xd6x_#z;u#fXSD)X(7PLb8ZT2j&GJRE~pe=-_~p#u1ww10jy5 zR09A6E9+tE1V~dji!&f!heIcJ4#;_MfrWSzMG_p;2!A)1`!B=_3|8lChy|yxh1WMU zfT#QhQ3;k<n{bcg4gy)1cF&%oq9O`kIKc|+rzIf*I5?J}-_X5za}Ftx%<I!E)$ihU z*1BSUje-vg67K!`WQ6oXn*ke#UOc=r&vk3;^GIdl2P<={D;IS|EEjzrwU4>Zb|~^~ zzx+($J=4uxR`$ko@`L<B%p0ek)B{SN$|jzk%x2mu@T@(UpXYw&J3vSr0)o?V#KTL! z(+9^Y))vT!xw*N>&~Q>@yn1!b#wH^;n6|yU8x&rgigT-;y8~S?O$K8UAZ3Lx4U~63 zIRWbyR=yZ&0+C#T;U2}s?xv=uR#vHng-ZxdaT8`ZOoT*4EQ5%y0BR}ob93-*rQNk_ z4pHFUJ7ndeLPECv6=z(CGZdIaY`^1~z!~FpN($2dSFo#WYO+TWu!Ee2jw%3GZCSyX zrjgJ|gsWP#P-S8W+1%;|Y#@a+N=+UhVq)G9O*N7G21AV8E1ocPm69q&s6ZwW%leSm zzPfe}smW5P)xIqa$(oyGP>$hqf#}02f!~D#$sz(O4!wv53mjk)x5nuR@UsVNC?7|$ z`m{Hw+JwJIi4ck4kO9@|b)dR162R&O4f!MwPdru@G|WUxMgVu#xVuQ1jDX@LG=Kf} z4a25%L6H3)M+sIeK$>xmQTnT>s!l-hh;SFHq!9<5_y}B`9Q!N4hAxK<foMWGsGs2b z!uh+Yw$>8&o79h$o7)DP4f%1u=Rz}T9xggcs1Gqp@gHni&|A|;mchUWI2pLYCR|tp z%WxTqHDVKY{>x^>aGagkPHR1zx??v6wz7FK2_9n&`tFh|pYdAyGm$gX?0I*2wklDM zj_fh*>yWssnAY_{^IUF^xYE6u?J@bh9<q>oy43WwG$VCa?3wH2vsR}m6N0ZXS{I^j zC-b`|t~{x-J22hr$T0ukomcF?w~s$;8LvN)R8sV%IlQmT`F(Eo=9}c`A#wSO93eyY zqROq$JHpUq-Y<CsPN>W~nzz5*d$VaPbPl9A)6H*tVc8Wy$)Re_s7#0J$m>2kJ>_`@ z4j*xGy9fg#BbQx%Vq-Qm>>^+n69s;w&xt0e0=*6BR|`GXAeF;OOM>AVHAHL3sf5^A zfqrMgM~*%$5Ft0aP-ctF{lp_|?H8k5susbzC-1n+E1()p1vU_9MsOfE#s@=gKum!# z2XB-)HRJs#8;qg12UIhbZ8_%!kG#~fb3A(5tG%kq?~uO!>AB4vn;NBuLWbbOaofd3 z3Posh73vKXG0Z@$%aL)zvgmdD1l$Yq@t`>m3V|evk`<c_ZfkiEj=?PrWN8{k9R4^f zJpd`fh0xX(4jvuIgZ@o*&0?#7O2(l|)GcP~Zo_N`WM?$1P7|<^1U*b@>gwzYe?TCG z$y*f&erfID96i|TV5B}_mYB!ofcjr6V2^$W){aB#%UZCt5U%ij4T>Ct%|O^G?|P$u 
zNmrNXWr=*z<IWvWOo_zmP!`#c&`=iaT)<BFj$0VjhJjcJ_C(JgC@z6{%iR9M*v`0l zs7k;v#UF$A2=!l7Mo7yvgD8A|za&kH?wNSRswy<45v}lIU~X5EE^kI8<^4Nm^5N<{ z*YbA})HygaGhg?_^)Qm3WRR!3uXR5rD$=UF;nPi6?9$Y&ke0DlvYy}#to_u@Y?z{* z8L1W?3vN$lW+WY%ZNjF9_-^hT_yn+@*;$7lQ#3yHRQGa&g)=h+UyP~CyTm_5md7nU zW&F&vE*e*j3<_nu`H&u4eaxEhA`C7K`c8R~Y6oK09RrO){f>K<p_3VV;J|%01-5XD ziiYnKw+xTIXrHjq&QMWTIeu9wp!RIGjMZ>{SadW4J^go#6$0NE=Z#=HPngfNQ$=Ph z!`QOu5HZ<4%hu4`OdO<%o7-c58&-wtoE}a`0wE-W!JRBdrQ%>we+o5n7Y1_7_LUYv zZ2az>K)8yIkr9R?5aS>g3<T=nvV$}j!9gU?tS#ArB#$S$<0CRwqN#c4Q6Pr5LA3X0 z<<v}sl))H+uYLRW!SMxDzhv*V2FNLY{D{w}ZiY(+I;&u&BQiIQKE$E`8W^9SpD)MS zjXVX`>mXK3fD4^kQjSCw_04-+fsBkdv?wt2#iYm|Kh}__!#}2{#5D_~k@|0&CzR=~ zTuyoU`h(U<(K}Dq)TrqWZKcaAP6f?_SbbvOkYIT0X6ee3<LsH|k4+5K)n|JDaKBo2 zg0$tT-SUfa12`o!9U7{l*yBcP-&T5W*{#|`H!>!#``xWyKaV;{uiyF`_tzo$lfP#! zqe<zli=m>=c9iUFdL~4^L&Ku`l~VqRY?L}P@<%V#(V{uR#l9s%Q?0TOFn$2rO=HLR zx)k4@$%w-8dCvW~(yGF-$mmE7X2Eyo60E$W*2q~T<>+eOs8$zcoAl*edOYJG{fR|3 zPxu5|HT$}&?XS}E<>!tht>NLRRy7UmRee`Wm#4pb-UyAppjh41{D^mq-<8^4GW}|! 
z)4E&z*ZTAbou*_#$MrSZGrgiyy$xc#?7?-4hvr;9{dru3Gyr)L%uxh9j<Ih4==Z$9 zO&^3*16gM<#>)|pG9>N`NYe1|Vuuj2rH|(PQxil~g^ol0X@+(NJyn1q-!dKmMEPb9 zoCUU#s3IA>R+59l!65r98tFVd=08qDGv6d^2@tIxx$XIwZM)f~p;zg7XS|&*&rWzX zR)l*6a@;3Vpi#abm{&vT&(-))?F#vk$dA7J<oj9ojUHoXImV-AN=74jaxc@dlU4WY z<1bT&g%#%ii(k}*#H2v>kSEJvv&XFyPuI3CXIx18w)uTJ^U~Nam#<FF$Q%%PKmlVp zcrcJv#mLAAhR|@+PWpY4o@&+8TTd@WhsP+_JYYczX`P}QWsQJ3$G)ZIW%Tz^*X$s~ zL+yX{D&ExF+Y7%RNIM}Ofs4_qXGM>_ri#k*bvleITJdx9f3YW!dZsU<?oY^s1plwh z9LF-J5|=E;LAxxPk_zwq;_^bfhR5yIZ05V85(^DZ$#!+ZvGvP5wa4<G?$sU{<~m-V zlTum5f5*g3jQ)-O@V^k`XqfR%-=TD?WV%#w{k><|*vHnkgf1Q@<5KN;qlO}i<>~Ez z-%B<^E2PS19Tlkwf34@XG#%DUmSx)B7cws1r9WczxQUNkey>;Zw62ls*Pj7v6~Cgb z6qR~<65Sskp!ycQzRXIhTmH|yy}UOtH0j~tB*WX9)r0<>(W%=%=4aY2+i_~f&xbnw zD=Jtl7n+TWU3r}onk<s=<W%!AUsC1TOmFeyn2MXff9`ytGCy7Ar4gL8>fYEj>a$o| z^~$xQ%ST-Q)6m>EqUNMDr+@VuX}HSIJIkVWAJoi4wdlxHRaEM~eoZ}Se`qUIy5<NI z%>zuvhQEfPp)_{=y(+onZ(qM|Ew*34OvVZSN;qYp;)Q9?khnJz3+T9zDb?U0APzr* z8;BHCPm+>sY;3@FFt5tm-57o9LW8920+@OzD&B*LQ!{OS;NBj1HDR!V?pwnin5|s8 z^z=VjDLs{vgTpDKOjs(Vw_1ZZgxsbFy|ey)jCM?jk8iTdS4(->#~4^WF+EK*`*xp4 z5&>laEb4X#vO_I{7X~o)<XLj4-gS51MMj!?1-g>Nr%x}T7Paf*^OO6_)sU;mu$v;% zM4C6M@w&`*-uqYCE3+Q%o>fm0S9N>k++yAjj~A&$-R+7|Qsr)-N=b_e&1tsUN$}Kq zH#RbsJO?L)I2xs7FCm|~JX$+l^1uh{E99(P2J?MY8>ghkz9sFIm27Qn>}u>S8Xg~P zXt#PM>ykOXaAB+9_9X@L*oO4bC#oyGW9wsO1$WL39?MV<F;fqW?7!EzvuHBQ!=uEi z@&lQCm=80bZE^p~PipGp+7p|fr8>lBJG%_|rEhpnzP=TpHa&HalFL*yVYIA(V>UGB zzS7xC^sd7L@+R9KjZC}+Il1*S<`<W!H%4boz9w#J|5_7^*gsTW=7qKIE$+@EEX`E? 
zXV&SC$K+z7J2gY>6Y>lE$w>i$qo?k8*FC$ZI>dQ}JT}gpu~^!3w#8I{jDjKtuc#=E zIH7<}wm2Ct$@?w(bdzIaM35Ia6~=v_%5}ecx9!M3%=$qlalJVnl{tLB79mQ7-xp*p zSUkuwt}$|kt0Z2n08WQ!3o;>p+22q3-((ImF=3<I3dLPH8C`4sLaG8D!kA}rmv`yW ziDxa=!l)ooG6RN$Sh;m-2|mkdTvQ4w9zsx9qDVx+9u*x8U1}RuHBm~Kd+i)#en{f~ zr`Kb>zxd1?8e}c476|-7!U%Lt6Qev9axZTojs4}&(@U3BA~vp8joutO&CXV?#HP9w z%&T0^$I7Mihvdph)@YSuzFy^h#=k0_a$cBvPg2@9^R72oS<ymWnjps!O(}g}G^=Cg z9rtBpX#!v3=8N)drRkAZQ&SmSyaANjcLv5kSC(BX>=q2H?ycMN(w9R{Ta7I!Z>ztz zW;>y3&^PG(*lTBZZ8e1(Dl1<d5{9WpVuw{KKMuqv=y{YB?92Gc&&o(4_g3o|rI3Ix zJDtq(CsJNQw4Zkbn@5S1zkl_8!;e-w9;qu2-0!i_;p#X_5dPV&7oEIkH?7#=C9Eo^ zuC7t1)s;rY_4{mdAAR*xwbU%B?j7aeIIfc7oF+%LqndK~SX|1jFfszh>fQnta{2Os z@_IZReTYTG&EUbT1o{=y4G6eiDk>Kmcp(=BV1pc>ZR91w5?ou?XDd&rIs>jX<a5Kn zejO*)$Dj)<d_bT>b5KN76tk5$jvv2B?}{qzhb|<YpkYm@b;GwCVE+p(Y<g5mHRf}u z39%L+%OqlJJ`Ql;YPN#IWC^Td0OBBV!ercpW?(V6POE&_J-;X@pjTkTaAhQ@z%Sv1 z3zwBbgX|95Kqm?cdZk6L&XUQgOV7N%p0A>!p|UA+nR@glLz;o{0F%3}%b(SQ+kYqO zt_JN%HqQ^AJS@9usCVE%!pFgR*e?4Mjwbi_v@#uf`Yz|j_4yAqEc}sOKg(=eraq@y z*&CLY<sl<!xcNwjGpeEb)$gkbv$Cs;KI2Pgaw-a{6uE*}p4#W@tCEIsSmx4nE$@mU zImV%?<vw9}L*>KI9UQ6Z;_hX4><u2o?lAwoZ+k7q&Ddz`eS`Iiy`H?4u5!<6JE>n_ zLwXvQ+Q6|IlCUrIvQZ{YP4Vw@Zj6OHeG4P=UN9K><|>w)IR8h=36729g}kyoZ$!h- zG95@5xY1pA-Qv;l-+RQIDz9X^pLW`>X(w{e$F}S1M*$)Chf*+0zjc&1>dTuqC&!Y% z)|i~Y?%I=n8@XVsL*U08CVH-*-MZ<FWyk8Q@{?R;0f^l?a-2X=h>ea`W`Xuts#XvX znAfwqp^@R?mJ<{9HrKsR88Po7Px;mMy{n5iN{wEr|L#;a5Wf-?7{9e&eNtH{vf=x2 z@ZI_<MQu&UF_o~@?*%hoLuztucH9mValP9*mi#*9!k1?Y^DHJwSz4KwPN>`3IWi?k zw#u@`vQkp&=gvJ6&2n9xEOIjBPF`qge=&G*h|{7|$EbPxM(+c-*Yg|4mJW1WJ6G|U zxjVtGe|p9~S=Nq9>7v=*)`a&n);5lpJ=%JP)zjxx2E+$EDEEJUQuvhB(o?x5{asp` zWx^j;2@xZ<r!K{1Wx*`o1@3*<OW&Ue*2up?PIK4jtPb;iI(E4$XPau<E<H`t*q-~8 zIP&<Jp8MSt&DdMZw>Ry0`&-@@diuP%)A=Wcd3QpGd7S7_hNaVnd-Y_BXu{=;dBfbh z<v)7QO^;NY=f{mF`H@KPF0X#EN1hI)*-3u>b9DFLE<Ox$E@Vu2>LWk<w{=1+LHa%$ znbHlKI_BGZz;{rg^lh*R`!ak|)VjQ%=Pw;khmcWDc-`35Qw)1vP%2m#^-z+=aMy_P zzHo3_TKs33-(&T(m0#|YB?$?^<fwG<NO@6cQu)lWVL3{JS+@bQQ5ME}s~z{g`VYo} 
zzTzpHZT7-#Wn7MI_u_JX5A*Jr9b7F=qK|5{BRDS73m&4ytX+;#+g=ZWz-rn0uIGnM zb9-(lMAePG;=;JM$Ch$DF)t`_llM~9Ql%6Wbq!2#M^Tbcee8?&<KF*rnf?e%5_2f4 zT<_b{2~VE=s?a`EXYrAoeC^EH-*pG~@AfKJd_ZBkxceLfR}d<iWwf3t{edfPNiEsk zA;cLz7{cCxjd$IUX+d4UL&IZr0{$F(x8-~y$Zks=k5-}dkNonQQ8|pG_8=vJOqaUK z{Du50{~OG5IG|th%WMXH8_VyhUn`21J?#>r^I0%Kh0-syat|4ynkz7Acy5*S?7*9K zt2+RDAJv)%k&0`wQsM)}HA&_6lF74NC8ta=*m>-ozUrPA6!Kw)8TUV`2ED#w|6ZN1 zTEPEonD0kI1lewi6z%gO$4`_>70PV=O1AIk3ID?Wirpssda=S=oyMEYu^MFlFBi8< z)=32fXm%;TlmgbL15>Y&nTyi5$4?x|zCX}J^^z+>Gd6{*jl1FT)9pp~lFO1_a&C6# zwC=xr$a~=6x1`mh@+~KxC9b8pCOGI)!KTf3NUx)u!d_6CZLxR9sRgNSS&7hG6GiQ{ zr<IFqhi(<B&d2oJr&A!ziuLv7yYTw$f2B}eG|TgQUc7)*;KAO3u3Hm$x4@Gd%IQ)T zEBy-lIL@|YOUa7GHPI;)D?F6&t*#oMrj_6d%DeJVwyeVW>xLdBbBszYCDlm=t5inD zqohN-`<nmul%%Kx5$OAz`)rSK2RYB~@2u5xTv4Yd5KI}l>F`X%HN$te1)tv8cWYwY zpV#fm(O+`n`}V6@GrEmiY0>ZF*!7^nLFz1n0yk9{eXX-Y{$Ylou<9`h`HSD_dLW9{ z`$kV2(|DxQLHK7tpJSScm|=s~hg_~8^LW*gL&dRaV)}7K-RCG2<me)6&Gl4`;&-G( z{yn8DrTIaE^Qf^L<)T%V;hzWOf$((0xa7A*Qa_2<nNy0yQ8BKdk}mXS8tNTNlPP^{ z^pYF=!ka#QW&fnmdUTWEH?;N4T<-_%-+g9`opy|-f?xK8lHYF`WG%fL98MzCb(IC( zP#~ok(>kTG$Al{g)8<(WO=g%t<XT%>`#HY%s(}G!CgLuszY9HEuQ)m5x=`F?D80{K zH2sZ3@R8#~c}A1bcbIk6!~WPB9Zu*myCb|}_<Gbh=3ywg`*}krBfc0I#@_@S^H%JP zCpQnzZ|7o7dHERV(ytd9KuY+>yi^kg4T%`)Ded1!bN_E#!Sk%b<{!UEj>hbg^4>$~ zN5N8V+~4Cg^G$8{jZMNcbt-}QW-&+CT`OGso3>@7`7)=Ed@$&h_ELCP`uQEYXK1qk zwf<m7sD6yZ3gRAEe)lZ#UPge&KnmFP-^ewJhlU54Izo7O!dc<h>2KPj4IRCBL_8yN zB3TKV5lsa&<bJ-j3S5M@-`{tL7L?2Ie!V())TZd)v#oEM-)kSNA3wLMKzSpPBQB1K z?7nXR8yD$J;e=26hcl<oSldU&9IkUgMT_<r2KPu@o;b914o$*nw1FM_B*knganBnq z@}QX3<GBdP3&3@Atl%}EpHHNid35?FB#eJ?#00(aKNF=*Qdh|9r-;0n>dJKN`#M7p zy+-ASs~65?T9S%|q~|*gOr0*e8FXO(?~&|%bH(=KZ_WLW?IiOh-Th=-@QSzNpSDBl z`JCi+0Us)Rl)yknkFvvd30*_@#a`w)kbhye#zN?_p5DO&2jC-EvHZ)UKq0&Iok2GI z&|oomCD&Z(KlInr^F7!Y4h}@pISx6%DoM%7dz7udwH<5pJy5FC=bhGU?IMsbL`g{} z_lWFG>x;}tYqtkXnp{C^6JuX9rOK^%tIh`A^9%c(8-Bw_^=?^?ub9sF`@8&CCbM$T zM@BIgw^3E4MPF-`e_BWg<evWvgaCX06$*vBhXVir-+c4UfddEhdi~>$5eb3_3JO}f 
zbg5G!f^mmK5sM9TFx|79K*;B?xSXLFF54Zr)j<H}Ci)c)8yz}c@V6L^Mx)V4l1w7^ zFL-!(Jk}uWU&dfC7>&lq)JuV)C<H;2N+m5{=;XfvKLq_FRP%HD00000NkvXXu0mjf Dg+;w& literal 0 HcmV?d00001 diff --git a/docs/assets/design/hybrid_kv_cache_manager/sw_attn.png b/docs/assets/design/hybrid_kv_cache_manager/sw_attn.png new file mode 100644 index 0000000000000000000000000000000000000000..10aa6146dc7ab478dc60c9383ab9fdb1c5de9d30 GIT binary patch literal 4560 zcmbtYcT`i)mq&hnMG*vnAX24w5D2~fq>4fSBho=4p*KOg^biOLh*U9vAfVI;NC_oK zQy>(n2{l9lp%+D@3H#i$f9=^ncF(@^&bu@3y_val=g#MT=Az6XhK%%_^fWXyj7E3$ z9?;Oxx&UR3%e25#w}%X;p}C%8q<80G2zqNi)QXjGOt;&e{c$JemU`^lJCey}*Du;K z7h7D^FNtkv4-`+f5qmsZkr@49vZ6UzXYx@kVajI1rf6!4y)*Xm<vIk9^VR#bLNRmN zU%s$a<)56uueBdX^70lA8d20o2ip_^Aj5sAUf4xo@aPUf?I}s1Ky&IF{pt!JU(yP* z0^-&`_e22kZ~9DQKt!{Ny#@r0xS9L~Ks<Qs{8#gmfA|0G<8I9V)G(=dU&q#Vc6+`x zE;jaAVBpglJy%z1IrM#3SJ!Q7suP%^BSSgcS@~P~KA4i2__49^Y@8J)&cvtHgM|S7 ze%5{z64HD)!e-d9I@93EDq{Q@YXbs-a5&syfbi&EVI&^RlcvyAij$Cb(V~3sj|kKn z!>mwm2Jy&7Y+HHXb8>PLoGFCLr=+AvOG)`vP4I<iEaXz|6$kF8a2N-mv2lN%_w{C< ztUoVt92!yBRC73<tDVUNgTV%$x_9{i75-QA{(+W#eG1}D%PA~0eLrq*XGd9nN|?#Z z<K#?n@$z~hZ7#;K_>c{Sh<(c5Xrp$C=%1k1mG^@buhSW#ZkSgm(P)PPs=pYF!GRnh z-&<M>E?+tkPWD)zsaUrODaP9QI5`z1B_-9@*T=@vB6}Jem0-IqU{+RE{oZrf{Zs7? 
za-;H8=4hscls6ABbep4&qp53z1u7{I*^4MA>IkcFrxq6%qfn?4cu0tfwszqNRb5@Z zva%AIiQjU&$-Sx!Zp`%ABscyHYk4u~B}T7*c`pn*m&*R+s`}cOnXgQT5%V@JmzD;- z#RotPyKmMd3{Gz-Ph=kWtLKU=1#&Ej4e)Ss4&d=`v?7jI-&a&t#_JSSS4)xPrx_Jx zWz&XS-sI&41qFGyyW4j}&iD1*MQ-fDOiWB192{zEYtz%yLs;<iJDVi#vJySZ@d<Z| zhRxvu@~DW<sn5F{V|?Nkb63a=N?7;uq$fRtt#9Bs{DtrBoab_C%EHF`2_&mTQMthR zIb@!EKWwq}O3aSP<G!K6!54s2)z?>6SHtCBEBnlS-c3(^@nUdzn0M3)3XLPHyJB<d z>w*lqUQhd}+W^LY+KL;6FRIY|SzCXdQCKq$f5A$BNimRPRGj)@HavA|Y)syBs;a@0 zP-EMa>PGg*YH4XzS68>Rv@|p{Oixb(RrwK~Rq9rfBY3p)iMw7!39$CW);;gC@?S}Z zaq^}@3@8L0OHSnncRk?(0h_RvnV>JSUey>dxWT7N{E2F)zHlVuU1gt=r4+mCHp-`> zlIXw6J}SV*usPqnZ$8Q+Sbv345gw3JWxszWTE{5twa})jpio~_<fIqw>iRw*ApwKg zL9vQDa~Q5Wn+vhTBM{GbJL%%%<Fm4|kVqt;e>(YIo0Wq@L2GNcy**E351Cw~Yjp0v zw|_3x3SRD+{~&EWZP+xS(UQi0e3*I}DmmpkT{>I8U3<fjccD>7jJxOM{l(vbd`9*u z(%1Nf*ciYBO^2qfWX);eo-{|Zi^#cGuU=(ITDrjDAT~BX6>iRygMQIWuMxqL_wNa( zMQv?!Gc#e+b#RV#IN+#L+pA524<10yj;({rSxk;L`MGzR8fuS2Y<gnhz>0{?WZ1(z zdet)9Xmw@ywY5p}^ZDK6yUJgBdq>yy2L=WJJju<?{r1gFt59}!E*n|lZH_}H>8yP@ z$ZnZnB1uH;CVSPF1U4w|Br=A)42HJs;dQxyEjTiqBh1sDA0*MyQnh#zaN_(1zq6tB zvibBNYkVxlvToc{lJ9v~qski5l<OceYHxq|2VF4y%`Az;2LffG(Wi5+mNj<MGcyvi zdU|!vD=RDhD(KZ!Hw1HvL2_DJN5~$`)HH37;PR>7b)==WRo1$$XJCL{m%C`mD6Z^U zx2)I$hyyP_MI!)L)))!5)|$k^YKaF8H!t7`l44>42<|yzgl7KSflEbzFl5taX;Usu zW<&7XjSm=^>&d-PySAIg0pICHET&()VBq5`)K6yTM@&pkD#**z($e0h$a63xYjdjy zd4+@=?vlGX{MBu^5M7<(HTBnZz;KJ~3`?lDDsrx?)p8OXxFVf$=W;(@@}iWZW7zrE zsV^Q3qtubY;D7+zX1}78q58_Vv@wf=uNBPA&B2_E{A^rYvjjr$){NrKl}BvnEob@e z>`ItvNN7;`M_h}L(u8@AlSZ$_f$Jy4)R*<Ng@SvQj{*u*9b588HeT~IClRXv06uZ| zeV7C4yWLbs;A-!|RkrtgZmW-NF_ayB&$3OFk@n0^Mwa>LnxGg0uIos#ZyapR>N#=p zD=7TjTl(syWZQ^Jbncgukx`P9OE(ts_4SoyqrZRuJ{+g?&f3;iJ7o8ku<$>omaH+I zU{cWOwekJ6iwD&ylkJ-M$R#Pq)26d?3WXB8Kyr59h-PMCI3OJ!f2fVmFDMe9#3BkY zh1Hc|&Cb6bX*@*$7|3C;qHQ-wtmhYc;`-3!XeY!!>SRmV)N~{^HWq<El#~bs2L8eZ zP`{%@GBYzb=UbIEHSP8P`FdeM<*Mza@zVipyFk&~>em@LCqvFRd>OER+IT@0j96`? 
zmRP07Ed-~B-=<nsRTY5rV`F0_C#cGbio@q;l>Q8qzFNM|lPA!J4~4qz7^T}y=zQAv zqM{9=l$h`6TRi!*?JL;iT>c>y9c}Vxb)aIjoLUSWDfe8x+Z2_1T1v3RqrP@`>nf&g zE4+|$&aHg=b~kb^#&pbgmGg8>?flu!EvAmro~6Q?f;H-I6aU>zCm9_vVd2uZZ=-u* zDQ=jWn$(mO&4^=gJ|xFmK)_lah#i5?3jQcFr?s}UI0MF3Rac*#p8oS^Lrjp3f!oC1 z8%_m>&b%S&bR20&KgacAv`<gk(+<ct%SIO-1cybP(s89pj%LdB7)?CVa`UXI#)vn( z`93v0mCJGFvbNJCM^RE#oNo&qCy_czOP7Bv<F8%2)^Vys2Gx(Uu(I0Q+n1}*N6H8b zhdINhWR>f3I?7L+KDHMW^>_VNdfcbT<4c;dlPph<55@=}jzilyiu2KE5eQnPxc1$< zfAI@NaEW_+drxl<0Ku+m6Kw!qR~Yho-`sy24))d*!S`?UVz7P!rIqideQTvowA=<N z1cyCKH6@N!fytz2T3p!a!9V{t{z~}1LE0NGPEPu(SCf;IGq^eW*~`h={{H@By3WfK zN}}RBSp1nepg=fFES5)_>$iPYx5`~&_iu04w4)~65b-A0$F)T!m!&Q76jmlCK8ORE ziw%Na{7zZn*<1&yjCTHuzvo)&45(vuj%<Q5^e-Rg;pUc-mX?-~7#kmFP410}iIJ0& z3*H%M#w=s}RM1y%$k?@oTsP&``Or`s=IRywW&~FFdD4Dss4$8>Kq%0nr!3<kRjs+! zd(%FH@X@tSm+m+e)Ek^-F^P&ykh`<lny4)-jp?dk;NPFzE7=*@I=Q_pH5@wm_4TZe zDjNFW<;l^4l|`WU4$VM#9TW-$=-3k&Oziki$v|{!DkM0#$vHhlDq`X2froGTeTJyG zk&WqRi<gGwA}{l6>OxuZDE!p+m}?&CSSfD3c@A%F0sluJX0fY3^QtIRPr%7xo1ks4 zsT(Mx<JLtXpu&v&(494y?57D|1Kd44>ZZLkH8l}0(P%qAzuorm-#sunq#MF4D5$ZZ zpa3I_tNo^$1MyhrX!}X#1;|S{)ml<Z>)G#>P=<)4A3x~#3LVflILBw_i%LB|Mvjzr zUcez60o;v>CwdyCb+2x3e~AkB7z9u<&X&}=_ba{Gp0wCzgJd3?m{R6&RrOqG)w&=F z*V=UZR8dh;ZQEpHZ7mF3A|lyYSs-?H5fPDr-4BTqWZvZ73Jiw&^9cCua*<&gZ$BQN zhFjf}cQ41xHGUZx7?y+mG-|2iEHHDB{k?y77+NH^G+3~369P?$AxmTzvkc`aR<|9l zW%ZMbiVTJY?UA)5lCf%zJf?LnCU4Z{)2t}Jf+{_~6QgRaq-1g=t+~0mC%%1qsde;r zBA66G-PuW_w&o&m1u{TO3gbFgt^fjH8w@GsJQjr6`B5`_HPOR0nv;sU=ZKfsA+~$E zj-O{{faze{+Rz-VLM$i=NlC+CWV`XjxIcw8Odg&!^ZsH9V>@xFerJb|>h}(6>S<B> zjNX4Dd6xwZ=;-KN2Xi~geu++Fz}BLtr^jNkSFT*C<7Cubml_`*huYen^rZ`4=C8GC zi#m_g*6zqT{AMh~ws{s(TUy=SLXoYVypMR9lxJXcXZyEnA#bup*0LoVsnjPiACW*B zV+_aKjdXjq9KFn!U{GqFpUhlQDWMhhywdD;H1qm)*sFjw3V>t`Sbl!~99c&Z<VQvZ z1|awi0?}GmM(3^&3&2I1TU(b<g2vBwKZxNZkIpu#?~6`?KMhrtw=bqKuWxJ{JK!Tv zca;%<9alF7x2}y(bJ~PGq@IB|Q<Quc+T!S#t}!tQBIFem0Jo<C!SK;WT;~lst-mzw zuak$NnAbf#k0k`%{h||?zyA&-;r!{lYrMuvj8q@E45Ry9>I3}!f#~22y0fm519{J< 
z83q;-q8$I)*xK3xPAS3LbLo?ur~jHiKRPLENdj)T!w7Dd`A26XmVZNRC@aU(s(p`G zfOrhiFG|Ik@3r%_Erfj?3YRM_dLrpHl)5$g6Q;{E^u@GB>SzeoH4$&MB4_o<&D_VQ z2B7;*O~Fr}%H=~yxCnKm9m4F%6T!lfug{Z1o0Q`KI^FD^MWwj=1~n=w5iIn4hI{-} z+?-vOTE#TB1kc4~76L7pKp^+MR$P(Ves9nwk-WUT4DcNfkFcLBLqOyjZMq)mz8i*B z&5=()>ZM<pdS#HELHHVud#9!r@@r}$Fkne1UrNNdTB`lMuN!Bh3*BT9$IEu`r^lVK z0FOhXxz+Nu!VkQprH!F^Mrf@F&RMU}B~evhVmMsyU5wnaqx_xACjPlts$GBgDR<H1 zTOpH=Sn+br$2_e+Jq4l0bR`Ue$)859@~)89x3+V+>ay3(<HrD`w8S{ro0)xt_0se4 z@!jN2z~$m_kr2qRC#D@KntDrAwnO0TEkIr07r!*HgsX;r$I_%S)7#c}M;yGm1bO{C z78C@Xe+ro}>>C((RY2(P>MH*9X@4L~QiInQFlKi4LvI2<x&Iy0JzA2<+hfPaK7B1@ z@_YN>q<i-^f*m>cXcZZ~CHyss_jz13)Ex<9a(9OhOI6Mv$u`Z4`hoDH8O7H#16Br) zDjm9ddh}igN1fj27$y=`z~G0CdlYPQ^JDy&!tGIo+eaq^b6tDrjalcwk@$Kaegs>V z+bf*$;3nWeTjCums<C-_c{w>ZR?K3{LHG#|hT|=%EpF*8?ma3?naLZaVtosXK(fw& zaxL}RGd{UrlAsdV&tqAWB?$!cq}(LE7X3TsSGu1U8jED~J8S(Gmk28-L;fZL{x3)W z|H>I;QjOu?h{O+NNMD$Rz*A#?;ZYzEJp)a4MAR|>d|X$ian6z|_{Sr%z=^OoMopb8 zo0wH!ui=1+n-*x?tPC~!djgNjBoeLm3j0%<4R7V(vvA@pz}sDqv8lgTw$DxL|IA42 o+TA{-13vt}(-!|N-*J8+BiqL-GhLJu=$yt#AEH;I^EmcD05IUwsQ>@~ literal 0 HcmV?d00001 diff --git a/docs/design/hybrid_kv_cache_manager.md b/docs/design/hybrid_kv_cache_manager.md new file mode 100644 index 0000000000000..8f17b473adc08 --- /dev/null +++ b/docs/design/hybrid_kv_cache_manager.md @@ -0,0 +1,245 @@ +# Hybrid KV Cache Manager + +!!! warning + This document was written based on commit [458e74](https://github.com/vllm-project/vllm/commit/458e74eb907f96069e6d8a4f3c9f457001fef2ea). This feature is still in its early stage and things may change. + +## What is a hybrid model? + +Many recent "hybrid" LLMs combine multiple attention types within one model. For example: + +1. Sliding window attention (sw) + full attention (full): gpt-oss, Gemma 2/3, Ministral, cohere, etc. +2. Mamba + full: Bamba, Jamba, Minimax, etc. +3. Local chunked attention + full: Llama4 + +To serve these models efficiently, our [KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] must: + +1. 
Allocate different slots to different layer type, for example: + - Full attention layers: reserve slots for **all** tokens. + - Sliding window layers: reserve slots only for the most recent **`sliding_window_size`** tokens. +2. Support layer-specific prefix-cache rules, for example: + - Full attention: a cache hit prefix requires **all** tokens remain in the KV cache. + - Sliding window: a cache hit prefix only requires the last **`sliding_window_size`** tokens remain in the KV cache. + +## Definitions + +1. **kv hidden size**: The number of bytes to store one token's KV cache for a single layer. +2. **block**: the memory reserved for kv cache are divided into multiple *blocks* with the same *page size* (defined below) +3. **block size**: number of tokens inside a block +4. **page size**: the physical memory size of a block, defined as: + + $$ + \text{num_layers} \times \text{block_size} \times \text{kv_hidden_size} + $$ + + `num_layers` doesn't mean the total number of layers in the model. The exact number depends on the context in this doc. + + !!! note + This is different from `KVCacheSpec.page_size_bytes` in the code, which is defined as: + + $$ + \text{block_size} \times \text{kv_hidden_size} + $$ + +## Allocation + +### High level idea + +We use a single memory pool for all layer types. The memory pool is split into multiple blocks with the same page size. [KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] allocates different numbers of blocks to different layers according to its attention type. + +The core challenge is ensuring every layer type uses the same **page size**. For full-attention-only models, the page size is straightforward, defined as: + +$$ +\text{page_size} = \text{block_size} \times \text{num_hidden_layers} \times \text{kv_hidden_size} +$$ + +However, in hybrid models, `num_hidden_layers` varies by attention type, which would normally produce mismatched page sizes. The cases below show how we unify them. 
+ +### Case 1: toy model + +Let's start with a toy example: a model has 1 full attention layer and 3 sliding window attention layers. All layers have the same `kv_hidden_size`. + +We let each block to hold `block_size` tokens for one layer, so: + +$$ +\text{page_size} = \text{kv_hidden_size} \times \text{block_size} +$$ + +[KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] allocates a different number of blocks to each layer. + +This case is only a toy example. For real models, please refer to the following cases. + +### Case 2: same `kv_hidden_size` and a regular pattern + +When the model has more layers, e.g., 20 sliding window attention layers and 10 full attention layers with the same `kv_hidden_size`. Calling the allocator once per layer (30 calls) is OK but becomes inefficient. As a solution, we group the allocation of layers that need the same number of blocks to reduce the number of calls. + +The grouping is feasible because there is usually a beautiful ratio between the number of different types of layers. For example: + +- Gemma-2: 1 sw : 1 full +- Llama 4: 3 local : 1 full + +Our example can be regarded as 2 sw : 1 full. We can allocate blocks as if there are 2 sw and 1 full in the model, and repeat the result by 10 times to generate the `block_ids` for the 30 layers. The page size becomes: + +$$ +10 \times \text{kv_hidden_size} \times \text{block_size} +$$ + +Assume `block_size` 16, sliding window size 32, request length 112, then for the above example model, we need to allocate 11 blocks (0-6 for full, 7-8 for sw group 1, 9-10 for sw group 2). + +![Allocation Result](../assets/design/hybrid_kv_cache_manager/basic_grouping_example.png) + +Here, "/" denotes no block needed (sliding‑window layers don't need slots for early tokens). + +See the formal definition below. The layers are divided into multiple *KV Cache Groups* so that there is: + +1. 
**Identical attention type inside each group**: Each group only contains layers with the same attention type and thus need the same number of blocks for a given request. This enables layers in the same group to share the same block ids without memory waste.
+2. **Identical page size across groups**: Because our memory pool only has one page size.
+
+Our example model is divided into 3 KV cache groups:
+
+- Group 0: 10 full attention layers (full.0 - full.9)
+- Group 1: 10 sliding window attention layers (sw.0 - sw.9)
+- Group 2: 10 sliding window attention layers (sw.10 - sw.19)
+
+Obviously, it satisfies rule 1. For rule 2, all 3 groups have
+
+$$
+10 \times \text{kv_hidden_size} \times \text{block_size}
+$$
+
+as their page size.
+
+### Case 3: same `kv_hidden_size` and no regular pattern
+
+Unfortunately, not all models have such a beautiful ratio, and the approach in Case 2 will produce too many small groups. For example, Gemma-3-27b has 52 sliding window attention layers and 10 full attention layers. With the constraints in case 2, it would be 26 sliding window groups and 5 full attention groups, each containing 2 layers. The allocation is still inefficient. To reduce the number of kv cache groups, we group layers using the smallest layer count among all attention types. For example, min(52, 10)=10 layers per group in Gemma-3-27b. Then the grouping result is:
+
+- Group 0: 10 full attention layers (full.0 - full.9)
+- Group 1: 10 sliding window attention layers (sw.0 - sw.9)
+- Group 2: 10 sliding window attention layers (sw.10 - sw.19)
+- ...
+- Group 6: 10 sliding window attention layers (sw.40 - sw.49)
+- Group 7: 2 sliding window attention layers (sw.50 - sw.51) and 8 padding layers
+
+We will update this algorithm if this heuristic leads to a bad result when a new model comes out (e.g., 20 full + 30 sw, the group size should be 10 instead of 20).
+ +This case happens in Gemma-3 series models, and models in case 2 but with eagle speculative decoding which introduce one full attention layer. The solution has some memory waste and is not perfect. Please report any cases where padding overhead becomes unacceptable so we can refine the algorithm. + +### Case 4: different `kv_hidden_size` (mainly hybrid mamba models) + +Some architectures (e.g., Bamba, Jamba, Minimax) interleave standard attention layers with Mamba layers, where each Mamba layer's state size per token can be much larger than the attention layers' `kv_hidden_size`. Because we only support a single page size across all groups, we must reconcile these differing hidden sizes. + +The current algorithm is: + +1. Increase the `block_size` of attention layers until + $$ + \text{block_size} \times \text{kv_hidden_size}_{\text{att}} \ge \text{state_size}_{\text{mamba}} + $$ +2. Pad the mamba state per layer to + $$ + \text{block_size} \times \text{kv_hidden_size}_{\text{att}} + $$ +3. Apply the grouping strategy in case 3. + +!!! note + This can lead to more than 400 `block_size` for attention layers, which is too large. Another padding strategy is to increase `block_size` until + + $$ + \text{block_size} \times \text{kv_hidden_size}_{\text{att}} \times \text{num_attn_layers} \ge \text{state_size}_{\text{mamba}} + $$ + + This padding strategy is still a work in progress. + +### Case 5: KV sharing + +KV sharing refers to a layer using the KV cache of another layer, e.g., gemma-3n. +In these models, [KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] ignores all layers with kv sharing and only allocates KV cache for layers that need kv cache, and some patches are made in model runner to apply the allocation result to kv sharing layers. + +## Prefix caching + +For simplicity, we assume `block_size=1` in this section. + +### High level idea + +The block pool uses a dict similar to `tuple(block_hash, group_id) -> block` to catch the full blocks. 
That means the same tokens of different groups are cached and evicted independently. + +When a new request comes in, we check the cache hit prefix of each group, and return the intersection of these groups as the cached prefix of the request. See below for the detailed algorithm for checking the cache hit of one group & performing the intersection. + +### Case 0: full attention only models + +For full attention layers, blocks are allocated for all tokens in the request. For details on the underlying design, see [Prefix Caching](prefix_caching.md) + +To find the longest cache hit prefix of a request, we enumerate from left (the first block) to right (the last block), checking whether the block is cached, and exit when cache misses. For example, we will return the first 7 tokens (0-6) as the cache hit prefix in the below example (blue blocks are cached): + +![Prefix Caching of Full Attention](../assets/design/hybrid_kv_cache_manager/full_attn.png) + +### Case 1: sliding window attention only models + +For sliding window attention layers, a naive implementation for memory allocation is to allocate `sliding_window_size` blocks and fill in the blocks in a round-robin way. But this naive implementation is not compatible with prefix caching so we didn't pick this design. In vLLM, we allocate different blocks for different tokens and free blocks that are outside the sliding window. + +For a new request, the cache hit prefix only requires the last `sliding_window_size - 1` tokens being cached. 
+Let's say `sliding_window_size = 4` and `block_size = 1`, and the request is a 15-token prompt (blue blocks are cached):
+
+![Prefix Caching of Sliding Window Attention](../assets/design/hybrid_kv_cache_manager/sw_attn.png)
+
+There are 3 possible cache hit prefixes:
+
+- cache hit length 5, compute prefill with [2, 3, 4] → [5, 6, …, 14]
+- cache hit length 6, compute prefill with [3, 4, 5] → [6, 7, …, 14]
+- cache hit length 14, compute prefill with [11, 12, 13] → [14] (most efficient)
+
+We can check the cache hit from right to left, and early exit when we find a match. This is opposite from full attention, where we check from left to right and early exit when the match fails. One potential con (compared to full attention) is that we end up iterating over the entire list of tokens when there's no match, which is often a common case. This could potentially cause non-negligible overheads, but it is fine with full + swa, as discussed below.
+
+### Case 2: sliding window attention + full attention models
+
+The first problem is how to find the cache hit prefix. We need to "intersect" the cache hits of global and sliding window attention layers by:
+
+1. Get the longest cache hit for full attention (scanning from left to right)
+2. Get the longest cache hit for sliding window attention that is within that length. Implemented by checking cache hits from right to left starting from the cache hit length of full attention.
+
+It can be ensured that the resulting cache hit of sliding window attention layers is also a cache hit of full attention layers. This is more efficient than finding all possible prefixes of each group and doing the intersection, because our approach can exit early if there is no cache hit.
+
+The algorithm applies to models with exactly two attention types: full attention + X, where X can be an arbitrary efficient attention algorithm like sliding window, llama 4 local attention, and mamba.
It doesn't support models without full attention layers, and models with more than 2 types of attention. This is enough for most hybrid models at the moment of writing this doc. + +The second question is the cache eviction policy. For now, we use one LRU queue for all kv cache groups. The blocks are added to the LRU queue when freed, either because the request is finished or the block is out of the sliding window. + +### Case 3: mamba models + +The prefix caching support of the mamba model is work in progress. Once implemented, models with mamba layer + full attention layer can be supported via the full attention + X algorithm in case 2. + +## Implementation + +### Overview + +![Overview of Hybrid KV Cache Manager](../assets/design/hybrid_kv_cache_manager/overview.png) + +The `KVCacheManager` is organized into 3 layers: + +- **[KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager]**: The interface between the scheduler and kv cache management system. +- **[KVCacheCoordinator][vllm.v1.core.kv_cache_coordinator.KVCacheCoordinator]**: coordinate per-group SingleTypeKVCacheManagers to generate the allocation result of a request. Depending on the model's configuration, one of these coordinators is chosen: + - **[KVCacheCoordinatorNoPrefixCache][vllm.v1.core.kv_cache_coordinator.KVCacheCoordinatorNoPrefixCache]**: Used when prefix caching is disabled. + - **[UnitaryKVCacheCoordinator][vllm.v1.core.kv_cache_coordinator.UnitaryKVCacheCoordinator]**: If only one KV cache group. The prefix caching logic is simplified as no intersection is needed. + - **[HybridKVCacheCoordinator][vllm.v1.core.kv_cache_coordinator.HybridKVCacheCoordinator]**: Handles exactly two KV cache groups (must include one full‑attention group plus one other efficient‑attention group). Other cases are not implemented. You can disable prefix caching to use the KVCacheCoordinatorNoPrefixCache. 
+- **[SingleTypeKVCacheManager][vllm.v1.core.single_type_kv_cache_manager.SingleTypeKVCacheManager]**: Each instance manages allocation and prefix caching for one KV cache group, implementing the attention‑type–specific logic (e.g., full attention, sliding window, Mamba). + +The blue box in the above figure shows the case with 10 full attention layers and 20 sliding window attention layers, thus: + +- use `HybridKVCacheCoordinator` +- use 1 `FullAttentionManager` and 2 `SlidingWindowManager` for the 3 `KVCacheGroup`s. + +### Memory Layout + +For a model with n `KVCacheGroup`s, each with m layers, we allocate m buffers. Each buffer is shared by n layers, one from each group. + +The following figure is for a model with 10 full attention layers (full.0 - full.9) and 20 sliding window attention layers (sw.0-sw.19). It follows "case 2" in "Allocation" section and is divided into 3 groups: + +- Group 0: 10 full attention layers (full.0 - full.9) +- Group 1: 10 sliding window attention layers (sw.0 - sw.9) +- Group 2: 10 sliding window attention layers (sw.10 - sw.19) + +And for a request, we allocate 11 blocks with `block_id` 0-6 to group 0, 7-8 to group 1, and 9-10 to group 2. + +With such an example, the physical memory is divided into 10 buffers (`KVCacheTensor` 0 - `KVCacheTensor` 9). Each buffer is shared by 3 layers (e.g., `KVCacheTensor` 0 is shared by full.0 from group 0, sw.0 from group 1, and sw.10 from group 2) and is divided into pieces with size `block_size * kv_hidden_size`. The KV cache of these 3 attention layers are saved to different pieces of the buffer based on the allocated `block_ids`: + +![Example Memory Layout](../assets/design/hybrid_kv_cache_manager/memory_layout.png) + +!!! note + One logic "block" is mapped to 10 pieces in the 10 buffers of the physical memory. 
From 2f13319f47eb9a78b471c5ced0fcf90862cd16a2 Mon Sep 17 00:00:00 2001 From: Huzaifa Sidhpurwala <huzaifas@redhat.com> Date: Wed, 27 Aug 2025 00:41:36 +0400 Subject: [PATCH 050/112] Enhance the pre-notification policy (#23532) Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com> --- SECURITY.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/SECURITY.md b/SECURITY.md index 414669fb3712e..d6319cdb1ac27 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -42,4 +42,9 @@ For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we ma * If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis. +* Organizations and vendors who either ship or use vLLM, are eligible to join the prenotification group if they meet at least one of the following qualifications + * Substantial internal deployment leveraging the upstream vLLM project. + * Established internal security teams and comprehensive compliance measures. + * Active and consistent contributions to the upstream vLLM project. + * We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included. 
From 6421b66bf4894a3e1e22d17c78901e3974173e09 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 26 Aug 2025 23:26:46 +0100 Subject: [PATCH 051/112] [Docs] Move quant supported hardware table to README (#23663) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/features/quantization/README.md | 48 ++++++++++++++++++- docs/features/quantization/bitblas.md | 2 +- .../quantization/supported_hardware.md | 32 ------------- 3 files changed, 48 insertions(+), 34 deletions(-) delete mode 100644 docs/features/quantization/supported_hardware.md diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md index e18c128f30fc9..4605ba7781ed4 100644 --- a/docs/features/quantization/README.md +++ b/docs/features/quantization/README.md @@ -4,7 +4,6 @@ Quantization trades off model precision for smaller memory footprint, allowing l Contents: -- [Supported Hardware](supported_hardware.md) - [AutoAWQ](auto_awq.md) - [AutoRound](auto_round.md) - [BitsAndBytes](bnb.md) @@ -19,3 +18,50 @@ Contents: - [AMD Quark](quark.md) - [Quantized KV Cache](quantized_kvcache.md) - [TorchAO](torchao.md) + +## Supported Hardware + +The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: + +<style> +td:not(:first-child) { + text-align: center !important; +} +td { + padding: 0.5rem !important; + white-space: nowrap; +} + +th { + padding: 0.5rem !important; + min-width: 0 !important; +} + +th:not(:first-child) { + writing-mode: vertical-lr; + transform: rotate(180deg) +} +</style> + +| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | AWS Neuron | Google TPU | +|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------| +| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | +| 
GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | +| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | +| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ | +| BitBLAS | ✅︎ | ✅ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| BitBLAS (GPTQ) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | ❌ | + +- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. +- ✅︎ indicates that the quantization method is supported on the specified hardware. +- ❌ indicates that the quantization method is not supported on the specified hardware. + +!!! note + This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. + + For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team. diff --git a/docs/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md index 6f53a448ee364..53b689ad53ff6 100644 --- a/docs/features/quantization/bitblas.md +++ b/docs/features/quantization/bitblas.md @@ -5,7 +5,7 @@ vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more effic !!! note Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`). Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper. - For details see [supported hardware](supported_hardware.md). + For details see [supported hardware](README.md#supported-hardware). 
Below are the steps to utilize BitBLAS with vLLM. diff --git a/docs/features/quantization/supported_hardware.md b/docs/features/quantization/supported_hardware.md deleted file mode 100644 index 06264d08b56aa..0000000000000 --- a/docs/features/quantization/supported_hardware.md +++ /dev/null @@ -1,32 +0,0 @@ -# Supported Hardware - -The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: - -<style> -th { - white-space: nowrap; - min-width: 0 !important; -} -</style> - -| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | AWS Neuron | Google TPU | -|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------| -| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | -| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | -| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | -| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ | -| BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | ❌ | - -- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. -- ✅︎ indicates that the quantization method is supported on the specified hardware. -- ❌ indicates that the quantization method is not supported on the specified hardware. - -!!! note - This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. 
- - For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team. From c3b0fd1ee670079649cd58abd99376bee521a8ff Mon Sep 17 00:00:00 2001 From: Zhonghua Deng <abzhonghua@gmail.com> Date: Wed, 27 Aug 2025 06:56:16 +0800 Subject: [PATCH 052/112] [V1][P/D]P2pNcclConnector supports flashinfer (#23536) Signed-off-by: Abatom <abzhonghua@gmail.com> Co-authored-by: Simon Mo <simon.mo@hey.com> --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 158 +++++++++--------- 1 file changed, 78 insertions(+), 80 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 25675d70fe225..2485c57d86ecc 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -30,27 +30,19 @@ logger = init_logger(__name__) class ReqMeta: # Request Id request_id: str - # Request tokens - token_ids: torch.Tensor - # Slot mappings, should have the same length as token_ids - slot_mapping: torch.Tensor + # Request block ids + block_ids: torch.Tensor + # Request num tokens + num_tokens: int @staticmethod def make_meta(request_id: str, token_ids: list[int], block_ids: list[int], block_size: int) -> "ReqMeta": - valid_num_tokens = len(token_ids) - token_ids_tensor = torch.tensor(token_ids) block_ids_tensor = torch.tensor(block_ids) - num_blocks = block_ids_tensor.shape[0] - block_offsets = torch.arange(0, block_size) - slot_mapping = block_offsets.reshape((1, block_size)) + \ - block_ids_tensor.reshape((num_blocks, 1)) * block_size - slot_mapping = slot_mapping.flatten()[:valid_num_tokens] - return ReqMeta( request_id=request_id, - token_ids=token_ids_tensor, - slot_mapping=slot_mapping, + block_ids=block_ids_tensor, + num_tokens=len(token_ids), ) @@ -123,63 +115,58 
@@ class P2pNcclConnector(KVConnectorBase_V1): return def inject_kv_into_layer( - dst_kv_cache_layer: torch.Tensor, - src_kv_cache: torch.Tensor, - slot_mapping: torch.Tensor, + layer: torch.Tensor, + kv_cache: torch.Tensor, + block_ids: torch.Tensor, request_id: str, ) -> None: - """Inject the KV cache into the layer. + """ + Inject KV cache data into a given attention layer tensor. + + This function updates `layer` in-place with values from `kv_cache`, + handling different backend layouts: + - MLA (Multi-Linear Attention) or FlashInfer: KV tensors are + indexed along the first dimension. + - FlashAttention: KV tensors are indexed along the second + dimension. + + If the number of provided block IDs does not match the number of KV + blocks, only the overlapping portion is updated, and a warning is + logged. Args: - dst_kv_cache_layer (torch.Tensor): the destination KV cache - layer. In shape [2, num_pages, page_size, xxx] if not - using MLA, [num_pages, page_size, xxx] otherwise. - src_kv_cache (torch.Tensor): the source KV cache. In shape - [2, num_tokens, xxx] if not using MLA, [num_tokens, xxx] - otherwise. - slot_mapping (torch.Tensor): the slot mapping. In shape - [num_tokens]. - request_id (str): request id for log + layer (torch.Tensor): The attention layer KV tensor to update. + kv_cache (torch.Tensor): The KV cache tensor to inject. + block_ids (torch.Tensor): Indices of the blocks to update. + request_id (str): Request identifier used for logging. + + Returns: + None. The function modifies `layer` in-place. 
""" - dst_kv_cache_layer_shape = dst_kv_cache_layer.shape - if isinstance(attn_metadata, MLACommonMetadata): - num_pages = dst_kv_cache_layer_shape[0] - page_size = dst_kv_cache_layer_shape[1] - dst_kv_cache_layer = dst_kv_cache_layer.reshape( - num_pages * page_size, -1) - self.check_tensors_except_dim(dst_kv_cache_layer, src_kv_cache, - 0) - num_token = src_kv_cache.shape[0] - if len(slot_mapping) == num_token: - dst_kv_cache_layer[slot_mapping, ...] = src_kv_cache + if (isinstance(attn_metadata, MLACommonMetadata) + or layer.shape[1] == 2): # MLA or FlashInfer + num_block = kv_cache.shape[0] + self.check_tensors_except_dim(layer, kv_cache, 0) + if len(block_ids) == num_block: + layer[block_ids, ...] = kv_cache else: - dst_kv_cache_layer[slot_mapping[:num_token], - ...] = src_kv_cache + layer[block_ids[:num_block], ...] = kv_cache logger.warning( - "🚧src_kv_cache does not match, num_slot:%d, " - "num_token:%d, request_id:%s", len(slot_mapping), - num_token, request_id) + "🚧kv_cache does not match, block_ids:%d, " + "num_block:%d, request_id:%s", len(block_ids), + num_block, request_id) - dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) - else: - num_pages = dst_kv_cache_layer_shape[1] - page_size = dst_kv_cache_layer_shape[2] - dst_kv_cache_layer = dst_kv_cache_layer.reshape( - 2, num_pages * page_size, -1) - self.check_tensors_except_dim(dst_kv_cache_layer, src_kv_cache, - 1) - num_token = src_kv_cache.shape[1] - if len(slot_mapping) == num_token: - dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache + elif layer.shape[0] == 2: # FlashAttention + num_block = kv_cache.shape[1] + self.check_tensors_except_dim(layer, kv_cache, 1) + if len(block_ids) == num_block: + layer[:, block_ids, ...] = kv_cache else: - dst_kv_cache_layer[:, slot_mapping[:num_token], - ...] = src_kv_cache + layer[:, block_ids[:num_block], ...] 
= kv_cache logger.warning( - "🚧src_kv_cache does not match, num_slot:%d, " - "num_token:%d, request_id:%s", len(slot_mapping), - num_token, request_id) - - dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) + "🚧kv_cache does not match, block_ids:%d, " + "num_block:%d, request_id:%s", len(block_ids), + num_block, request_id) # Get the metadata metadata: KVConnectorMetadata = \ @@ -201,19 +188,17 @@ class P2pNcclConnector(KVConnectorBase_V1): if kv_cache is None: continue - kv_cache_layer = kv_cache[ \ - forward_context.virtual_engine] + layer = kv_cache[forward_context.virtual_engine] kv_cache = self.p2p_nccl_engine.recv_tensor( request.request_id + "#" + layer_name) if kv_cache is None: - logger.warning("🚧src_kv_cache is None, %s", - request.request_id) + logger.warning("🚧kv_cache is None, %s", request.request_id) continue - inject_kv_into_layer(kv_cache_layer, kv_cache, - request.slot_mapping, request.request_id) + inject_kv_into_layer(layer, kv_cache, request.block_ids, + request.request_id) def wait_for_layer_load(self, layer_name: str) -> None: """Blocking until the KV for a specific layer is loaded into vLLM's @@ -247,20 +232,33 @@ class P2pNcclConnector(KVConnectorBase_V1): def extract_kv_from_layer( layer: torch.Tensor, - slot_mapping: torch.Tensor, + block_ids: torch.Tensor, ) -> torch.Tensor: - """Extract the KV cache from the layer. - - Assume the shape of the layer is (2, num_pages, page_size, xxx) - if MLA is not used, and (num_pages, page_size, xxx) otherwise. """ - if isinstance(attn_metadata, MLACommonMetadata): - num_pages, page_size = layer.shape[0], layer.shape[1] - return layer.reshape(num_pages * page_size, -1)[slot_mapping, - ...] - num_pages, page_size = layer.shape[1], layer.shape[2] - return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, - ...] + Extract KV cache slices from a given attention layer tensor. 
+ + This function handles multiple backend layouts: + - MLA (Multi-Linear Attention) or FlashInfer: KV tensors are + indexed along the first dimension. + - FlashAttention: KV tensors are indexed along the second + dimension. + + Args: + layer (torch.Tensor): The KV cache from the attention layer. + block_ids (torch.Tensor): Indices of blocks to extract. + + Returns: + torch.Tensor: A tensor containing the extracted KV slices. + Returns None if the layout is unsupported. + """ + if (isinstance(attn_metadata, MLACommonMetadata) + or layer.shape[1] == 2): # MLA or FlashInfer + return layer[block_ids, ...] + + if layer.shape[0] == 2: # FlashAttention + return layer[:, block_ids, ...] + + return None connector_metadata = self._get_connector_metadata() assert isinstance(connector_metadata, P2pNcclConnectorMetadata) @@ -269,7 +267,7 @@ class P2pNcclConnector(KVConnectorBase_V1): ip, port = self.parse_request_id(request_id, True) remote_address = ip + ":" + str(port + self._rank) - kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) + kv_cache = extract_kv_from_layer(kv_layer, request.block_ids) self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name, kv_cache, remote_address) From 5f1af97f86021cf2819e5ab2d84722dac53c2257 Mon Sep 17 00:00:00 2001 From: Thomas Parnell <tpa@zurich.ibm.com> Date: Wed, 27 Aug 2025 01:28:55 +0200 Subject: [PATCH 053/112] [V1] [Hybrid] Enable Full CUDA graph by default for hybrid models in V1 (#22594) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> --- vllm/model_executor/models/config.py | 42 ++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 882df7e8162c5..f62209326b988 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -4,6 +4,7 @@ from copy import deepcopy from typing import TYPE_CHECKING import vllm.envs as envs +from vllm.config.compilation import CUDAGraphMode from 
vllm.logger import init_logger from vllm.model_executor.models import ModelRegistry from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv @@ -275,6 +276,42 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig): "%d for performance.", 1024) +class MambaModelConfig(VerifyAndUpdateConfig): + + @classmethod + def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: + """ + Enable FULL_AND_PIECEWISE cuda graph mode by default (required + to get good performance for mamba layers in V1). + + Args: + vllm_config: vLLM Config + """ + + if not envs.VLLM_USE_V1: + return + + model_config = vllm_config.model_config + compilation_config = vllm_config.compilation_config + + model_cls, _ = ModelRegistry.resolve_model_cls( + model_config.architecture, + model_config=model_config, + ) + + # TODO(tdoublep): remove as full cuda graph support is added + FCG_NOT_SUPPORTED_MODELS = [ + "Lfm2ForCausalLM", "MiniMaxText01ForCausalLM" + ] + + if (model_config.architecture not in FCG_NOT_SUPPORTED_MODELS + and compilation_config.cudagraph_mode is None): + logger.info( + "Hybrid or mamba-based model detected: setting cudagraph mode " + "to FULL_AND_PIECEWISE in order to optimize performance.") + compilation_config.cudagraph_mode = CUDAGraphMode.FULL_AND_PIECEWISE + + class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): @classmethod @@ -293,6 +330,9 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): if not envs.VLLM_USE_V1: return + # Enable FULL_AND_PIECEWISE by default + MambaModelConfig.verify_and_update_config(vllm_config) + cache_config = vllm_config.cache_config model_config = vllm_config.model_config parallel_config = vllm_config.parallel_config @@ -374,4 +414,6 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "JambaForSequenceClassification": JambaForSequenceClassificationConfig, "GraniteMoeHybridForCausalLM": GraniteMoeHybridModelConfig, "GptOssForCausalLM": GptOssForCausalLMConfig, + "MambaForCausalLM": MambaModelConfig, + 
"Mamba2ForCausalLM": MambaModelConfig, } From 714872f1a9c779c2ce9bbf5440f08ec278dc569a Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 26 Aug 2025 19:48:32 -0400 Subject: [PATCH 054/112] [Compile] Fix Cmake Warning (#23689) Signed-off-by: yewentao256 <zhyanwentao@126.com> --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b0ed4a284db95..b0eb0f32e03a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,7 +30,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) # Supported python versions. These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. # -set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12", "3.13") +set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13") # Supported AMD GPU architectures. set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") From 585e0bde36abdb2ab2967fd42005cbe62459020e Mon Sep 17 00:00:00 2001 From: Federico <65908512+coval3nte@users.noreply.github.com> Date: Wed, 27 Aug 2025 02:29:52 +0200 Subject: [PATCH 055/112] [Bugfix] UnboundLocalError when GptOss reasoning specified (#23054) Signed-off-by: Federico <65908512+coval3nte@users.noreply.github.com> --- vllm/entrypoints/openai/serving_chat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 8b50153f01152..7e0e627780970 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -887,7 +887,8 @@ class OpenAIServingChat(OpenAIServing): delta_message = DeltaMessage(content=delta_text) # update the previous values for the next iteration - if tool_choice_auto or self.reasoning_parser: + if ((tool_choice_auto or self.reasoning_parser) + and not self.use_harmony): assert 
previous_texts is not None assert all_previous_token_ids is not None previous_texts[i] = current_text From b1625dbe9cee497c0eefd9d1221377f64fec1e03 Mon Sep 17 00:00:00 2001 From: zixuanzhang226 <zixuanzhang@bytedance.com> Date: Tue, 26 Aug 2025 18:06:10 -0700 Subject: [PATCH 056/112] feat: add triton fused moe config for GLM-4.5-Air-FP8 on B200 (#23695) Signed-off-by: Zixuan Zhang <zixuanzhang@bytedance.com> --- ...evice_name=NVIDIA_B200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..b962d19506ce5 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} From 6891205b161e78ea6e255da194a4470e06997a3b Mon Sep 17 00:00:00 2001 From: wuhang <wuhang6@huawei.com> Date: Wed, 27 Aug 2025 09:06:58 +0800 Subject: [PATCH 057/112] [Feature][Responses API] Support MCP tool in background mode (#23494) Signed-off-by: wuhang <wuhang6@huawei.com> --- vllm/entrypoints/context.py | 31 ++- vllm/entrypoints/openai/serving_responses.py | 265 ++++++++++--------- 2 files 
changed, 162 insertions(+), 134 deletions(-) diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index f70e1fc207f86..9d587e8669339 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -4,13 +4,15 @@ import json import logging from abc import ABC, abstractmethod from collections.abc import Sequence -from typing import TYPE_CHECKING, Union +from contextlib import AsyncExitStack +from typing import TYPE_CHECKING, Optional, Union from openai_harmony import Author, Message, Role, StreamState, TextContent from vllm.entrypoints.harmony_utils import ( get_encoding, get_streamable_parser_for_assistant, render_for_completion) from vllm.entrypoints.tool import Tool +from vllm.entrypoints.tool_server import ToolServer from vllm.outputs import RequestOutput if TYPE_CHECKING: @@ -37,6 +39,11 @@ class ConversationContext(ABC): def render_for_completion(self) -> list[int]: pass + @abstractmethod + async def init_tool_sessions(self, tool_server: Optional[ToolServer], + exit_stack: AsyncExitStack) -> None: + pass + class SimpleContext(ConversationContext): @@ -55,16 +62,21 @@ class SimpleContext(ConversationContext): def render_for_completion(self) -> list[int]: raise NotImplementedError("Should not be called.") + async def init_tool_sessions(self, tool_server: Optional[ToolServer], + exit_stack: AsyncExitStack) -> None: + pass + class HarmonyContext(ConversationContext): def __init__( self, messages: list, - tool_sessions: dict[str, Tool], + available_tools: list[str], ): self._messages = messages - self.tool_sessions = tool_sessions + self.available_tools = available_tools + self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {} self.parser = get_streamable_parser_for_assistant() self.num_init_messages = len(messages) @@ -116,10 +128,10 @@ class HarmonyContext(ConversationContext): if recipient is not None: if recipient.startswith("browser."): return await self.call_search_tool( - self.tool_sessions["browser"], last_msg) + 
self._tool_sessions["browser"], last_msg) elif recipient.startswith("python"): return await self.call_python_tool( - self.tool_sessions["python"], last_msg) + self._tool_sessions["python"], last_msg) raise ValueError("No tool call found") def render_for_completion(self) -> list[int]: @@ -161,6 +173,15 @@ class HarmonyContext(ConversationContext): recipient=Role.ASSISTANT) ] + async def init_tool_sessions(self, tool_server: Optional[ToolServer], + exit_stack: AsyncExitStack) -> None: + if tool_server: + for tool_name in self.available_tools: + if tool_name not in self._tool_sessions: + self._tool_sessions[ + tool_name] = await exit_stack.enter_async_context( + tool_server.new_session(tool_name)) + class StreamingHarmonyContext(HarmonyContext): diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 5adcb310e3468..67eec2d523e3f 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -8,7 +8,7 @@ from collections.abc import AsyncGenerator, AsyncIterator, Sequence from contextlib import AsyncExitStack from copy import copy from http import HTTPStatus -from typing import Any, Callable, Final, Optional, Union +from typing import Callable, Final, Optional, Union import jinja2 import openai.types.responses as openai_responses_types @@ -248,10 +248,10 @@ class OpenAIServingResponses(OpenAIServing): raw_request.state.request_metadata = request_metadata if self.tool_server is not None and isinstance( - self.tool_server, MCPToolServer - ) and (request.background or request.stream) and request.tools and any( - tool.type in ["web_search_preview", "code_interpreter"] - for tool in request.tools): + self.tool_server, + MCPToolServer) and request.stream and request.tools and any( + tool.type in ["web_search_preview", "code_interpreter"] + for tool in request.tools): return self.create_error_response( "MCP tool server is not supported in background mode and " "streaming 
mode") @@ -265,103 +265,70 @@ class OpenAIServingResponses(OpenAIServing): builtin_tool_list.append("browser") if self.tool_server.has_tool("python"): builtin_tool_list.append("python") - async with AsyncExitStack() as exit_stack: - try: - if self.tool_server is not None: - # TODO: initialize tool sessions lazily when the session - # is actually used. - tool_session_ctxs: dict[str, Any] = { - tool_name: - exit_stack.enter_async_context( - self.tool_server.new_session(tool_name)) - for tool_name in builtin_tool_list - } - tool_sessions = {} - for tool_name in builtin_tool_list: - tool_sessions[tool_name] = ( - await tool_session_ctxs[tool_name]) - else: - assert len(builtin_tool_list) == 0 - tool_sessions = {} - for i, engine_prompt in enumerate(engine_prompts): - default_max_tokens = self.max_model_len - len( - engine_prompt["prompt_token_ids"]) - sampling_params = request.to_sampling_params( - default_max_tokens, self.default_sampling_params) - trace_headers = (None if raw_request is None else await - self._get_trace_headers( - raw_request.headers)) + if self.tool_server is not None: + available_tools = builtin_tool_list + else: + assert len(builtin_tool_list) == 0 + available_tools = [] + try: + for i, engine_prompt in enumerate(engine_prompts): + default_max_tokens = self.max_model_len - len( + engine_prompt["prompt_token_ids"]) + sampling_params = request.to_sampling_params( + default_max_tokens, self.default_sampling_params) - context: ConversationContext - if self.use_harmony: - if request.stream: - context = StreamingHarmonyContext( - messages, tool_sessions) - else: - context = HarmonyContext(messages, tool_sessions) + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) + + context: ConversationContext + if self.use_harmony: + if request.stream: + context = StreamingHarmonyContext( + messages, available_tools) else: - context = SimpleContext() - generator = self._generate_with_builtin_tools( - 
request_id=request.request_id, - request_prompt=request_prompts[i], - engine_prompt=engine_prompt, - sampling_params=sampling_params, - context=context, - lora_request=lora_request, - priority=request.priority, - trace_headers=trace_headers, - ) - generators.append(generator) - except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) - - assert len(generators) == 1 - result_generator, = generators - - # Store the input messages. - if request.store: - self.msg_store[request.request_id] = messages - - if request.background: - created_time = int(time.time()) - response = ResponsesResponse.from_request( - request, - sampling_params, - model_name=model_name, - created_time=created_time, - output=[], - status="queued", - usage=None, + context = HarmonyContext(messages, available_tools) + else: + context = SimpleContext() + generator = self._generate_with_builtin_tools( + request_id=request.request_id, + request_prompt=request_prompts[i], + engine_prompt=engine_prompt, + sampling_params=sampling_params, + context=context, + lora_request=lora_request, + priority=request.priority, + trace_headers=trace_headers, ) - async with self.response_store_lock: - self.response_store[response.id] = response + generators.append(generator) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) - # Run the request in the background. - task = asyncio.create_task( - self._run_background_request( - request, - sampling_params, - result_generator, - context, - model_name, - tokenizer, - request_metadata, - created_time, - ), - name=f"create_{response.id}", - ) + assert len(generators) == 1 + result_generator, = generators - # For cleanup. - response_id = response.id - self.background_tasks[response_id] = task - task.add_done_callback( - lambda _: self.background_tasks.pop(response_id, None)) - return response + # Store the input messages. 
+ if request.store: + self.msg_store[request.request_id] = messages - if request.stream: - return self.responses_stream_generator( + if request.background: + created_time = int(time.time()) + response = ResponsesResponse.from_request( + request, + sampling_params, + model_name=model_name, + created_time=created_time, + output=[], + status="queued", + usage=None, + ) + async with self.response_store_lock: + self.response_store[response.id] = response + + # Run the request in the background. + task = asyncio.create_task( + self._run_background_request( request, sampling_params, result_generator, @@ -369,21 +336,41 @@ class OpenAIServingResponses(OpenAIServing): model_name, tokenizer, request_metadata, - ) + created_time, + ), + name=f"create_{response.id}", + ) - try: - return await self.responses_full_generator( - request, - sampling_params, - result_generator, - context, - model_name, - tokenizer, - request_metadata, - ) - except Exception as e: - return self.create_error_response(str(e)) - return self.create_error_response("Should not reach here") + # For cleanup. 
+ response_id = response.id + self.background_tasks[response_id] = task + task.add_done_callback( + lambda _: self.background_tasks.pop(response_id, None)) + return response + + if request.stream: + return self.responses_stream_generator( + request, + sampling_params, + result_generator, + context, + model_name, + tokenizer, + request_metadata, + ) + + try: + return await self.responses_full_generator( + request, + sampling_params, + result_generator, + context, + model_name, + tokenizer, + request_metadata, + ) + except Exception as e: + return self.create_error_response(str(e)) async def _make_request( self, @@ -439,14 +426,16 @@ class OpenAIServingResponses(OpenAIServing): if created_time is None: created_time = int(time.time()) - try: - async for _ in result_generator: - pass - except asyncio.CancelledError: - return self.create_error_response("Client disconnected") - except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + async with AsyncExitStack() as exit_stack: + try: + await context.init_tool_sessions(self.tool_server, exit_stack) + async for _ in result_generator: + pass + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) if self.use_harmony: assert isinstance(context, HarmonyContext) @@ -838,7 +827,7 @@ class OpenAIServingResponses(OpenAIServing): status_code=HTTPStatus.BAD_REQUEST, ) - async def responses_stream_generator( + async def _process_streaming_events( self, request: ResponsesRequest, sampling_params: SamplingParams, @@ -847,18 +836,8 @@ class OpenAIServingResponses(OpenAIServing): model_name: str, tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, - created_time: Optional[int] = None, + created_time: int, ) -> AsyncGenerator[str, None]: - # TODO: - # 1. 
Handle disconnect - - if not isinstance(context, StreamingHarmonyContext): - raise NotImplementedError( - "Streaming is not supported for responses API without Harmony." - ) - - created_time = created_time or int(time.time()) - sequence_number = 0 def _send_event(event: BaseModel): @@ -1270,3 +1249,31 @@ class OpenAIServingResponses(OpenAIServing): sequence_number=-1, response=final_response.model_dump(), )) + + async def responses_stream_generator( + self, + request: ResponsesRequest, + sampling_params: SamplingParams, + result_generator: AsyncIterator[Optional[ConversationContext]], + context: ConversationContext, + model_name: str, + tokenizer: AnyTokenizer, + request_metadata: RequestResponseMetadata, + created_time: Optional[int] = None, + ) -> AsyncGenerator[str, None]: + # TODO: + # 1. Handle disconnect + + if not isinstance(context, StreamingHarmonyContext): + raise NotImplementedError( + "Streaming is not supported for responses API without Harmony." + ) + + created_time = created_time or int(time.time()) + + async with AsyncExitStack() as exit_stack: + await context.init_tool_sessions(self.tool_server, exit_stack) + async for event_data in self._process_streaming_events( + request, sampling_params, result_generator, context, + model_name, tokenizer, request_metadata, created_time): + yield event_data From c7c80af084e4d87c4e73148cb71ee990970281ff Mon Sep 17 00:00:00 2001 From: yzds <41983536+youzhedian@users.noreply.github.com> Date: Wed, 27 Aug 2025 09:21:11 +0800 Subject: [PATCH 058/112] fix pynccl reduce_scatter (#23648) Co-authored-by: hongchao <hongchao@msh.team> --- vllm/distributed/device_communicators/cuda_communicator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index 0ea8de2f36f4b..eef3f9f75f9f1 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ 
b/vllm/distributed/device_communicators/cuda_communicator.py @@ -152,7 +152,7 @@ class CudaCommunicator(DeviceCommunicatorBase): dtype=input_tensor.dtype, device=input_tensor.device) - pynccl_comm.reduce_scatter(output, input_) + pynccl_comm.reduce_scatter(output, input_tensor) # Reshape before returning return output.movedim(0, dim).contiguous() @@ -186,9 +186,9 @@ class CudaCommunicator(DeviceCommunicatorBase): device=input_tensor.device) if sizes is not None: - pynccl_comm.reduce_scatterv(output, input_, sizes=sizes) + pynccl_comm.reduce_scatterv(output, input_tensor, sizes=sizes) else: - pynccl_comm.reduce_scatter(output, input_) + pynccl_comm.reduce_scatter(output, input_tensor) # Reshape before returning return output.movedim(0, dim).contiguous() From 2c2b140ae8c60dc0c38e4d37274fc7106a72c21b Mon Sep 17 00:00:00 2001 From: czhu-cohere <conway.zhu@cohere.com> Date: Tue, 26 Aug 2025 21:23:23 -0400 Subject: [PATCH 059/112] [quantization] use channel scales for w4a8 + misc fixes (#23570) Signed-off-by: czhu-cohere <conway.zhu@cohere.com> --- tests/quantization/test_compressed_tensors.py | 44 +++++++++++++++++-- .../schemes/compressed_tensors_w4a8_fp8.py | 13 +++++- .../kernels/mixed_precision/MPLinearKernel.py | 1 + .../kernels/mixed_precision/cutlass.py | 19 ++++---- 4 files changed, 63 insertions(+), 14 deletions(-) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 296743dbfa041..b9774b7ee2631 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -14,10 +14,10 @@ from compressed_tensors.quantization import QuantizationType from tests.models.utils import check_logprobs_close from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensors24, CompressedTensorsLinearMethod, - CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4, - CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, 
- CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, - CompressedTensorsWNA16) + CompressedTensorsW4A4Fp4, CompressedTensorsW4A8Fp8, + CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24, + CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, + CompressedTensorsW8A16Fp8, CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.utils.quant_utils import ( cutlass_fp4_supported) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( @@ -683,3 +683,39 @@ def test_compressed_tensors_nvfp4(vllm_runner, args): output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) assert output + + +@pytest.mark.skipif( + not current_platform.is_cuda() + or not current_platform.has_device_capability(90), + reason="W4A8 FP8 is not yet supported on this GPU type.", +) +@pytest.mark.parametrize("args", [ + ("czhu-cohere/TinyLlama-1.1B-Chat-v1.0-W4A8-e2e", CompressedTensorsW4A8Fp8) +]) +def test_compressed_tensors_w4a8_fp8(vllm_runner, args): + model, scheme = args + with vllm_runner(model, enforce_eager=True) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + for proj in (qkv_proj, o_proj, gate_up_proj, down_proj): + assert isinstance(proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(proj.scheme, scheme) + + assert proj.weight_packed.dtype is torch.int32 + assert proj.weight_scale.dtype is torch.float8_e4m3fn + assert proj.weight_chan_scale.dtype is torch.float32 + assert proj.scheme.group_size == 128 + + llm.apply_model(check_model) + output = llm.generate_greedy("Hello my name is", max_tokens=20) + print(output) + assert output diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py index 
f6cc49c2316ba..3d9827058803e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py @@ -79,7 +79,8 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme): act_type=torch.float8_e4m3fn, # always use fp8(e4m3) group_size=self.group_size, zero_points=not self.symmetric, - has_g_idx=self.has_g_idx + has_g_idx=self.has_g_idx, + out_type=params_dtype ) kernel_type = choose_mp_linear_kernel(mp_linear_kernel_config) @@ -122,7 +123,7 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme): torch.empty( output_size_per_partition, scales_and_zp_size, - dtype=params_dtype, + dtype=torch.float8_e4m3fn, ) } @@ -140,9 +141,17 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme): dtype=torch.int64), weight_loader=weight_loader) + # per-channel scales + weight_chan_scale = ChannelQuantScaleParameter( + data=torch.empty((output_size_per_partition, 1), + dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) + layer.register_parameter("weight_packed", weight) layer.register_parameter("weight_scale", weight_scale) layer.register_parameter("weight_shape", weight_shape) + layer.register_parameter("weight_chan_scale", weight_chan_scale) self.kernel = kernel_type(mp_linear_kernel_config, w_q_param_name="weight_packed", diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py index 07ecc096231a4..1280f5f1eadf7 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py @@ -20,6 +20,7 @@ class MPLinearLayerConfig: group_size: int zero_points: bool has_g_idx: bool + out_type: Optional[torch.dtype] = None class MPLinearKernel(ABC): diff --git 
a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py index f1d49693fc016..9e23c0dd3595b 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py @@ -60,13 +60,17 @@ class CutlassW4A8LinearKernel(MPLinearKernel): if in_features % 128 or out_features % 128: return False, "K and N must be divisible by 128, got "\ f"{c.partition_weight_shape}" + + if c.out_type != torch.bfloat16: + return False, "Only bfloat16 output type currently supported"\ + f"got {c.out_type=}" + return True, None # note assumes that # `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0} # `weight_scale` is: {input_dim = 0, output_dim = 1} def process_weights_after_loading(self, layer: torch.nn.Module): - c = self.config # TODO(czhu): optimize speed/mem usage def transform_w_q(x): @@ -86,19 +90,15 @@ class CutlassW4A8LinearKernel(MPLinearKernel): # Encode/reorder weights and pack scales self._transform_param(layer, self.w_q_name, transform_w_q) self._transform_param(layer, self.w_s_name, transform_w_s) - - # TODO(czhu): support loading channel scales - self.w_ch_s = torch.ones((c.partition_weight_shape[1], ), - dtype=torch.float32, - device='cuda') + self._transform_param(layer, "weight_chan_scale", lambda x: x) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - assert bias is None, "bias not supported by CUTLASS W4A8" c = self.config w_q, w_s, _, _ = self._get_weight_params(layer) + w_ch_s = layer.weight_chan_scale x_2d = x.reshape(-1, x.shape[-1]) out_shape = x.shape[:-1] + (c.partition_weight_shape[1], ) @@ -109,6 +109,9 @@ class CutlassW4A8LinearKernel(MPLinearKernel): b_group_scales=w_s, b_group_size=c.group_size, a_token_scales=act_scales, - b_channel_scales=self.w_ch_s) + b_channel_scales=w_ch_s) + 
+ if bias is not None: + output.add_(bias) # In-place add return output.reshape(out_shape) From eb1995167e04e01c465e1cf4c39d5fd0b2031724 Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Tue, 26 Aug 2025 18:23:26 -0700 Subject: [PATCH 060/112] [gpt-oss] Enable unit test for response API harmony integration (#23533) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- .../openai/test_response_api_with_harmony.py | 45 ++++++++++++------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 1ca52599c519d..72d468db08f65 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -11,18 +11,25 @@ from openai import BadRequestError, NotFoundError, OpenAI from ...utils import RemoteOpenAIServer -pytest.skip(allow_module_level=True, reason="gpt-oss can't run on CI yet.") - MODEL_NAME = "openai/gpt-oss-20b" -DTYPE = "bfloat16" @pytest.fixture(scope="module") -def server(): +def monkeypatch_module(): + from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() + yield mpatch + mpatch.undo() + + +@pytest.fixture(scope="module") +def server(monkeypatch_module: pytest.MonkeyPatch): args = ["--enforce-eager", "--tool-server", "demo"] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server + with monkeypatch_module.context() as m: + m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1") + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server @pytest_asyncio.fixture @@ -269,10 +276,11 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_streaming(client: OpenAI, model_name: str): + # TODO: Add back when web search and code interpreter are available in CI prompts = [ "tell me a 
story about a cat in 20 words", - "What is 13 * 24? Use python to calculate the result.", - "When did Jensen found NVIDIA? Search it and answer the year only.", + # "What is 13 * 24? Use python to calculate the result.", + # "When did Jensen found NVIDIA? Search it and answer the year only.", ] for prompt in prompts: @@ -281,15 +289,15 @@ async def test_streaming(client: OpenAI, model_name: str): input=prompt, reasoning={"effort": "low"}, tools=[ - { - "type": "web_search_preview" - }, - { - "type": "code_interpreter", - "container": { - "type": "auto" - } - }, + # { + # "type": "web_search_preview" + # }, + # { + # "type": "code_interpreter", + # "container": { + # "type": "auto" + # } + # }, ], stream=True, ) @@ -317,6 +325,7 @@ async def test_streaming(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.skip(reason="Web search tool is not available in CI yet.") async def test_web_search(client: OpenAI, model_name: str): response = await client.responses.create( model=model_name, @@ -331,6 +340,7 @@ async def test_web_search(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.") async def test_code_interpreter(client: OpenAI, model_name: str): response = await client.responses.create( model=model_name, @@ -436,6 +446,7 @@ async def test_function_calling(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.flaky(reruns=5) async def test_function_calling_multi_turn(client: OpenAI, model_name: str): tools = [ { From de02b07db4741cc9ed40b8262d7a67e6bce30211 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Tue, 26 Aug 2025 21:34:57 -0400 Subject: [PATCH 061/112] [Bugfix] Lazy import gpt_oss_triton_kernels_moe for mxfp4 (#23678) Signed-off-by: mgoin <mgoin64@gmail.com> --- 
vllm/model_executor/layers/quantization/mxfp4.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index df96e5d8c413e..bdeb169a4b97f 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -10,8 +10,6 @@ from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase) -from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( - triton_kernel_moe_forward) from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -557,6 +555,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): )[0] return trtllm_gen_output else: + from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( # noqa: E501 + triton_kernel_moe_forward) return triton_kernel_moe_forward( hidden_states=x, w1=self.w13_weight_triton_tensor, From 6dab89b8ece7e022bd3df5774c9ddf309e2eb2d9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 27 Aug 2025 02:47:08 +0100 Subject: [PATCH 062/112] [Docs] Fix math rendering in docs (#23676) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/mkdocs/javascript/mathjax.js | 20 ++++++++++++++++++++ mkdocs.yaml | 7 ++++--- requirements/docs.txt | 1 - 3 files changed, 24 insertions(+), 4 deletions(-) create mode 100644 docs/mkdocs/javascript/mathjax.js diff --git a/docs/mkdocs/javascript/mathjax.js b/docs/mkdocs/javascript/mathjax.js new file mode 100644 index 0000000000000..5da0d443578c4 --- /dev/null +++ b/docs/mkdocs/javascript/mathjax.js @@ -0,0 +1,20 @@ +// Enables MathJax rendering +window.MathJax = { + tex: { + inlineMath: [["\\(", "\\)"]], + displayMath: [["\\[", 
"\\]"]], + processEscapes: true, + processEnvironments: true + }, + options: { + ignoreHtmlClass: ".*|", + processHtmlClass: "arithmatex" + } +}; + +document$.subscribe(() => { + MathJax.startup.output.clearCache() + MathJax.typesetClear() + MathJax.texReset() + MathJax.typesetPromise() +}) diff --git a/mkdocs.yaml b/mkdocs.yaml index 47fe1ebce9712..507a80c41e8b4 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -129,15 +129,16 @@ markdown_extensions: - toc: permalink: true # For math rendering - - mdx_math: - enable_dollar_delimiter: true + - pymdownx.arithmatex: + generic: true extra_css: - mkdocs/stylesheets/extra.css extra_javascript: - mkdocs/javascript/run_llm_widget.js - - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML + - mkdocs/javascript/mathjax.js + - https://unpkg.com/mathjax@3.2.2/es5/tex-mml-chtml.js - mkdocs/javascript/edit_and_feedback.js - mkdocs/javascript/slack_and_forum.js diff --git a/requirements/docs.txt b/requirements/docs.txt index 3b72a8a9e755e..d1c546398780a 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -7,7 +7,6 @@ mkdocs-awesome-nav mkdocs-glightbox mkdocs-git-revision-date-localized-plugin mkdocs-minify-plugin -python-markdown-math regex ruff From fecbb7c782980d0d9d104784a233ecb95a20ddda Mon Sep 17 00:00:00 2001 From: Wei <weiweinpu@gmail.com> Date: Tue, 26 Aug 2025 19:54:23 -0700 Subject: [PATCH 063/112] [Bugfix][gpt-oss] passing the cache config in gpt-oss (#23613) Signed-off-by: Wei Wei <wwei6@meta.com> --- vllm/model_executor/models/gpt_oss.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index cd93f0ef1e310..9c1c05320cf36 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -174,12 +174,15 @@ class TransformerBlock(torch.nn.Module): def __init__( self, config: GptOssConfig, + cache_config: CacheConfig, quant_config: QuantizationConfig, prefix: 
str = "", ): super().__init__() self.layer_idx = extract_layer_index(prefix) - self.attn = OAIAttention(config, prefix=f"{prefix}.attn") + self.attn = OAIAttention(config, + prefix=f"{prefix}.attn", + cache_config=cache_config) self.mlp = MLPBlock(config, self.layer_idx, quant_config=quant_config, @@ -203,6 +206,7 @@ class GptOssModel(nn.Module): ): super().__init__() self.config = vllm_config.model_config.hf_config + self.cache_config = vllm_config.cache_config self.quant_config = vllm_config.quant_config self.parallel_config = vllm_config.parallel_config self.config.hidden_size = self.config.hidden_size @@ -213,6 +217,7 @@ class GptOssModel(nn.Module): self.layers = torch.nn.ModuleList([ TransformerBlock( self.config, + cache_config=self.cache_config, quant_config=self.quant_config, prefix=maybe_prefix(prefix, f"block.{layer_idx}"), ) for layer_idx in range(self.config.num_hidden_layers) From 786835807b491279af1fc5f565df9c6baedf3827 Mon Sep 17 00:00:00 2001 From: Yiheng Xu <charlesyihengxu@gmail.com> Date: Wed, 27 Aug 2025 10:58:32 +0800 Subject: [PATCH 064/112] [Bugfix]: Qwen3 Coder Tool Parser (#23099) Signed-off-by: Yiheng Xu <charlesyihengxu@gmail.com> Co-authored-by: Aaron Pham <contact@aarnphm.xyz> --- examples/tool_chat_template_qwen3coder.jinja | 117 ++++ tests/tool_use/test_qwen3coder_tool_parser.py | 178 +++++- .../tool_parsers/qwen3coder_tool_parser.py | 519 ++++++++++-------- 3 files changed, 571 insertions(+), 243 deletions(-) create mode 100644 examples/tool_chat_template_qwen3coder.jinja diff --git a/examples/tool_chat_template_qwen3coder.jinja b/examples/tool_chat_template_qwen3coder.jinja new file mode 100644 index 0000000000000..49b0e8d0ee7e6 --- /dev/null +++ b/examples/tool_chat_template_qwen3coder.jinja @@ -0,0 +1,117 @@ +{% macro render_extra_keys(json_dict, handled_keys) %} + {%- if json_dict is mapping %} + {%- for json_key in json_dict if json_key not in handled_keys %} + {%- if json_dict[json_key] is mapping or (json_dict[json_key] is 
sequence and json_dict[json_key] is not string) %} + {{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }} + {%- else %} + {{-'\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }} + {%- endif %} + {%- endfor %} + {%- endif %} +{% endmacro %} + +{%- if messages[0]["role"] == "system" %} + {%- set system_message = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} +{%- endif %} + +{%- if not tools is defined %} + {%- set tools = [] %} +{%- endif %} + +{%- if system_message is defined %} + {{- "<|im_start|>system\n" + system_message }} +{%- else %} + {%- if tools is iterable and tools | length > 0 %} + {{- "<|im_start|>system\nYou are Qwen, a helpful AI assistant that can interact with a computer to solve tasks." }} + {%- endif %} +{%- endif %} +{%- if tools is iterable and tools | length > 0 %} + {{- "\n\n# Tools\n\nYou have access to the following functions:\n\n" }} + {{- "<tools>" }} + {%- for tool in tools %} + {%- if tool.function is defined %} + {%- set tool = tool.function %} + {%- endif %} + {{- "\n<function>\n<name>" ~ tool.name ~ "</name>" }} + {%- if tool.description is defined %} + {{- '\n<description>' ~ (tool.description | trim) ~ '</description>' }} + {%- endif %} + {{- '\n<parameters>' }} + {%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {{- '\n<parameter>' }} + {{- '\n<name>' ~ param_name ~ '</name>' }} + {%- if param_fields.type is defined %} + {{- '\n<type>' ~ (param_fields.type | string) ~ '</type>' }} + {%- endif %} + {%- if param_fields.description is defined %} + {{- '\n<description>' ~ (param_fields.description | trim) ~ '</description>' }} + {%- endif %} + {%- set handled_keys = ['name', 'type', 'description'] %} + {{- 
render_extra_keys(param_fields, handled_keys) }} + {{- '\n</parameter>' }} + {%- endfor %} + {%- endif %} + {% set handled_keys = ['type', 'properties'] %} + {{- render_extra_keys(tool.parameters, handled_keys) }} + {{- '\n</parameters>' }} + {%- set handled_keys = ['type', 'name', 'description', 'parameters'] %} + {{- render_extra_keys(tool, handled_keys) }} + {{- '\n</function>' }} + {%- endfor %} + {{- "\n</tools>" }} + {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }} +{%- endif %} +{%- if system_message is defined %} + {{- '<|im_end|>\n' }} +{%- else %} + {%- if tools is iterable and tools | length > 0 %} + {{- '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in loop_messages %} + {%- if message.role == "assistant" and message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %} + {{- '<|im_start|>' + message.role }} + {%- if message.content is defined and message.content is string and message.content | trim | length > 0 %} + {{- '\n' + message.content | trim + '\n' }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- 
endif %} + {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }} + {%- if tool_call.arguments is defined %} + {%- for args_name, args_value in tool_call.arguments|items %} + {{- '<parameter=' + args_name + '>\n' }} + {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %} + {{- args_value }} + {{- '\n</parameter>\n' }} + {%- endfor %} + {%- endif %} + {{- '</function>\n</tool_call>' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "user" or message.role == "system" or message.role == "assistant" %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "tool" %} + {%- if loop.previtem and loop.previtem.role != "tool" %} + {{- '<|im_start|>user\n' }} + {%- endif %} + {{- '<tool_response>\n' }} + {{- message.content }} + {{- '\n</tool_response>\n' }} + {%- if not loop.last and loop.nextitem.role != "tool" %} + {{- '<|im_end|>\n' }} + {%- elif loop.last %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py index 40c3158e9e683..ccb2acf512caf 100644 --- a/tests/tool_use/test_qwen3coder_tool_parser.py +++ b/tests/tool_use/test_qwen3coder_tool_parser.py @@ -16,7 +16,7 @@ from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import ( from vllm.transformers_utils.detokenizer import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer -MODEL = "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8" +MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8" @pytest.fixture(scope="module") @@ -397,7 +397,9 @@ hello world "no_tools", "single_tool", 
"single_tool_with_content", + "single_tool_multiline_param", "parallel_tools", + "tool_with_typed_params", # Added this test case ], argnames=["model_output", "expected_tool_calls", "expected_content"], argvalues=[ @@ -422,7 +424,7 @@ fahrenheit "state": "TX", "unit": "fahrenheit" }))) - ], ""), + ], None), ('''Sure! Let me check the weather for you.<tool_call> <function=get_current_weather> <parameter=city> @@ -445,6 +447,30 @@ fahrenheit }))) ], "Sure! Let me check the weather for you."), ('''<tool_call> +<function=calculate_area> +<parameter=shape> +rectangle +</parameter> +<parameter=dimensions> +{"width": 10, + "height": 20} +</parameter> +<parameter=precision> +2 +</parameter> +</function> +</tool_call>''', [ + ToolCall(function=FunctionCall(name="calculate_area", + arguments=json.dumps({ + "shape": "rectangle", + "dimensions": { + "width": 10, + "height": 20 + }, + "precision": 2 + }))) + ], None), + ('''<tool_call> <function=get_current_weather> <parameter=city> Dallas @@ -484,13 +510,36 @@ celsius "state": "FL", "unit": "celsius" }))) - ], ""), + ], None), + # Added tool_with_typed_params test case + ('''Let me calculate that area for you.<tool_call> +<function=calculate_area> +<parameter=shape> +circle +</parameter> +<parameter=dimensions> +{"radius": 15.5} +</parameter> +<parameter=precision> +3 +</parameter> +</function> +</tool_call>''', [ + ToolCall(function=FunctionCall(name="calculate_area", + arguments=json.dumps({ + "shape": "circle", + "dimensions": { + "radius": 15.5 + }, + "precision": 3 + }))) + ], "Let me calculate that area for you."), ], ) def test_extract_tool_calls_streaming(qwen3_tool_parser, qwen3_tokenizer, sample_tools, model_output, expected_tool_calls, expected_content): - """Test incremental streaming behavior""" + """Test incremental streaming behavior including typed parameters""" request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools) @@ -539,7 +588,7 @@ def 
test_extract_tool_calls_streaming(qwen3_tool_parser, qwen3_tokenizer, "arguments"] += tool_call.function.arguments # Verify final content - assert other_content == expected_content + assert other_content == (expected_content or "") # Handle None case # Verify we got all expected tool calls assert len(tool_states) == len(expected_tool_calls) @@ -559,6 +608,125 @@ def test_extract_tool_calls_streaming(qwen3_tool_parser, qwen3_tokenizer, assert actual_args == expected_args +def test_extract_tool_calls_missing_closing_parameter_tag( + qwen3_tool_parser, sample_tools): + """Test handling of missing closing </parameter> tag""" + # Using get_current_weather from sample_tools but with malformed XML + model_output = '''Let me check the weather for you: +<tool_call> +<function=get_current_weather> +<parameter=city> +Dallas +<parameter=state> +TX +</parameter> +<parameter=unit> +fahrenheit +</parameter> +</function> +</tool_call>''' + + request = ChatCompletionRequest(model=MODEL, + messages=[], + tools=sample_tools) + extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + model_output, request=request) + + # The parser should handle the malformed XML gracefully + assert extracted_tool_calls.tools_called + assert len(extracted_tool_calls.tool_calls) == 1 + + # Verify the function name is correct + assert extracted_tool_calls.tool_calls[ + 0].function.name == "get_current_weather" + + # Verify the arguments are parsed despite the missing closing tag + args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) + assert "city" in args + assert args["city"] == "Dallas" + assert args["state"] == "TX" + assert args["unit"] == "fahrenheit" + + # Check that content before the tool call is preserved + assert "Let me check the weather for you:" in extracted_tool_calls.content + + +def test_extract_tool_calls_streaming_missing_closing_tag( + qwen3_tool_parser, qwen3_tokenizer, sample_tools): + """Test streaming with missing closing </parameter> tag""" + # Using 
get_current_weather from sample_tools but with malformed XML + model_output = '''Let me check the weather for you: +<tool_call> +<function=get_current_weather> +<parameter=city> +Dallas +<parameter=state> +TX +</parameter> +<parameter=unit> +fahrenheit +</parameter> +</function> +</tool_call>''' + + request = ChatCompletionRequest(model=MODEL, + messages=[], + tools=sample_tools) + + other_content = '' + tool_states = {} + + for delta_message in stream_delta_message_generator( + qwen3_tool_parser, qwen3_tokenizer, model_output, request): + + if delta_message.content: + other_content += delta_message.content + + if delta_message.tool_calls: + for tool_call in delta_message.tool_calls: + idx = tool_call.index + + if idx not in tool_states: + tool_states[idx] = { + "id": None, + "name": None, + "arguments": "", + "type": None + } + + if tool_call.id: + tool_states[idx]["id"] = tool_call.id + + if tool_call.type: + assert tool_call.type == "function" + tool_states[idx]["type"] = tool_call.type + + if tool_call.function: + if tool_call.function.name: + tool_states[idx]["name"] = tool_call.function.name + + if tool_call.function.arguments is not None: + tool_states[idx][ + "arguments"] += tool_call.function.arguments + + # Verify content was streamed + assert "Let me check the weather for you:" in other_content + + # Verify we got the tool call + assert len(tool_states) == 1 + state = tool_states[0] + assert state["id"] is not None + assert state["type"] == "function" + assert state["name"] == "get_current_weather" + + # Verify arguments were parsed correctly despite missing closing tag + assert state["arguments"] is not None + args = json.loads(state["arguments"]) + assert args["city"] == "Dallas" + assert args["state"] == "TX" + assert args["unit"] == "fahrenheit" + + def test_extract_tool_calls_streaming_incremental(qwen3_tool_parser, qwen3_tokenizer, sample_tools): diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py 
b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py index 2501d6739e8f6..955813ddd3408 100644 --- a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +import ast import json import uuid from collections.abc import Sequence @@ -22,7 +22,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module(["qwen3_coder"]) +@ToolParserManager.register_module("qwen3_coder") class Qwen3CoderToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): @@ -30,6 +30,8 @@ class Qwen3CoderToolParser(ToolParser): self.current_tool_name_sent: bool = False self.prev_tool_call_arr: list[dict] = [] + # Override base class type - we use string IDs for tool calls + self.current_tool_id: Optional[str] = None # type: ignore self.streamed_args_for_tool: list[str] = [] # Sentinel tokens for streaming mode @@ -42,20 +44,6 @@ class Qwen3CoderToolParser(ToolParser): self.is_tool_call_started: bool = False self.failed_count: int = 0 - # Streaming state variables - self.current_tool_index: int = 0 - self.header_sent: bool = False - self.current_tool_string_id: Optional[str] = None - self.current_function_name: Optional[str] = None - self.current_param_name: Optional[str] = None - self.current_param_value: str = "" - self.param_count: int = 0 - self.in_param: bool = False - self.in_function: bool = False - self.accumulated_text: str = "" - self.json_started: bool = False - self.json_closed: bool = False - # Enhanced streaming state - reset for each new message self._reset_streaming_state() @@ -67,7 +55,8 @@ class Qwen3CoderToolParser(ToolParser): self.tool_call_function_regex = re.compile( r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL) self.tool_call_parameter_regex = re.compile( - 
r"<parameter=(.*?)</parameter>|<parameter=(.*?)$", re.DOTALL) + r"<parameter=(.*?)(?:</parameter>|(?=<parameter=)|(?=</function>)|$)", + re.DOTALL) if not self.model_tokenizer: raise ValueError( @@ -84,8 +73,8 @@ class Qwen3CoderToolParser(ToolParser): "Qwen3 XML Tool parser could not locate tool call start/end " "tokens in the tokenizer!") - logger.debug("vLLM Successfully import tool parser %s !", - self.__class__.__name__) + logger.info("vLLM Successfully import tool parser %s !", + self.__class__.__name__) def _generate_tool_call_id(self) -> str: """Generate a unique tool call ID.""" @@ -96,7 +85,7 @@ class Qwen3CoderToolParser(ToolParser): self.current_tool_index = 0 self.is_tool_call_started = False self.header_sent = False - self.current_tool_string_id = None + self.current_tool_id = None self.current_function_name = None self.current_param_name = None self.current_param_value = "" @@ -106,122 +95,122 @@ class Qwen3CoderToolParser(ToolParser): self.accumulated_text = "" self.json_started = False self.json_closed = False + # Store accumulated parameters for type conversion + self.accumulated_params = {} + self.streaming_request = None + + def _get_arguments_config( + self, func_name: str, + tools: Optional[list[ChatCompletionToolsParam]]) -> dict: + """Extract argument configuration for a function.""" + if tools is None: + return {} + for config in tools: + if not hasattr(config, "type") or not (hasattr( + config, "function") and hasattr(config.function, "name")): + continue + if config.type == "function" and config.function.name == func_name: + if not hasattr(config.function, "parameters"): + return {} + params = config.function.parameters + if isinstance(params, dict) and "properties" in params: + return params["properties"] + elif isinstance(params, dict): + return params + else: + return {} + logger.warning("Tool '%s' is not defined in the tools list.", + func_name) + return {} + + def _convert_param_value(self, param_value: str, param_name: str, + 
param_config: dict, func_name: str) -> Any: + """Convert parameter value based on its type in the schema.""" + # Handle null value for any type + if param_value.lower() == "null": + return None + + if param_name not in param_config: + if param_config != {}: + logger.warning( + "Parsed parameter '%s' is not defined in the tool " + "parameters for tool '%s', directly returning the " + "string value.", param_name, func_name) + return param_value + + if isinstance(param_config[param_name], + dict) and "type" in param_config[param_name]: + param_type = str(param_config[param_name]["type"]).strip().lower() + else: + param_type = "string" + if param_type in ["string", "str", "text", "varchar", "char", "enum"]: + return param_value + elif param_type.startswith("int") or param_type.startswith( + "uint") or param_type.startswith( + "long") or param_type.startswith( + "short") or param_type.startswith("unsigned"): + try: + return int(param_value) + except (ValueError, TypeError): + logger.warning( + "Parsed value '%s' of parameter '%s' is not an " + "integer in tool '%s', degenerating to string.", + param_value, param_name, func_name) + return param_value + elif param_type.startswith("num") or param_type.startswith("float"): + try: + float_param_value = float(param_value) + return float_param_value if float_param_value - int( + float_param_value) != 0 else int(float_param_value) + except (ValueError, TypeError): + logger.warning( + "Parsed value '%s' of parameter '%s' is not a float " + "in tool '%s', degenerating to string.", param_value, + param_name, func_name) + return param_value + elif param_type in ["boolean", "bool", "binary"]: + param_value = param_value.lower() + if param_value not in ["true", "false"]: + logger.warning( + "Parsed value '%s' of parameter '%s' is not a boolean " + "(`true` or `false`) in tool '%s', degenerating to " + "false.", param_value, param_name, func_name) + return param_value == "true" + else: + if param_type in ["object", "array", "arr" + ] 
or param_type.startswith( + "dict") or param_type.startswith("list"): + try: + param_value = json.loads(param_value) + return param_value + except (json.JSONDecodeError, TypeError, ValueError): + logger.warning( + "Parsed value '%s' of parameter '%s' cannot be " + "parsed with json.loads in tool '%s', will try " + "other methods to parse it.", param_value, param_name, + func_name) + try: + param_value = ast.literal_eval(param_value) # safer + except (ValueError, SyntaxError, TypeError): + logger.warning( + "Parsed value '%s' of parameter '%s' cannot be " + "converted via Python `ast.literal_eval()` in tool " + "'%s', degenerating to string.", param_value, param_name, + func_name) + return param_value def _parse_xml_function_call( self, function_call_str: str, tools: Optional[list[ChatCompletionToolsParam]] ) -> Optional[ToolCall]: - def get_arguments_config(func_name: str) -> dict: - if tools is None: - return {} - for config in tools: - if not hasattr(config, "type") or not ( - hasattr(config, "function") - and hasattr(config.function, "name")): - continue - if (config.type == "function" - and config.function.name == func_name): - if not hasattr(config.function, "parameters"): - return {} - params = config.function.parameters - if isinstance(params, dict) and "properties" in params: - return params["properties"] - elif isinstance(params, dict): - return params - else: - return {} - logger.warning("Tool '%s' is not defined in the tools list.", - func_name) - return {} - - def convert_param_value(param_value: str, param_name: str, - param_config: dict, func_name: str) -> Any: - # Handle null value for any type - if param_value.lower() == "null": - return None - - converted_value: Any - - if param_name not in param_config: - if param_config != {}: - logger.warning( - "Parsed parameter '%s' is not defined in the tool " - "parameters for tool '%s', directly returning the " - "string value.", param_name, func_name) - return param_value - - if 
(isinstance(param_config[param_name], dict) - and "type" in param_config[param_name]): - param_type = str( - param_config[param_name]["type"]).strip().lower() - else: - param_type = "string" - if param_type in [ - "string", "str", "text", "varchar", "char", "enum" - ]: - return param_value - elif (param_type.startswith("int") or param_type.startswith("uint") - or param_type.startswith("long") - or param_type.startswith("short") - or param_type.startswith("unsigned")): - try: - converted_value = int(param_value) - return converted_value - except ValueError: - logger.warning( - "Parsed value '%s' of parameter '%s' is not an " - "integer in tool '%s', degenerating to string.", - param_value, param_name, func_name) - return param_value - elif (param_type.startswith("num") - or param_type.startswith("float")): - try: - float_param_value = float(param_value) - converted_value = (float_param_value if float_param_value - - int(float_param_value) != 0 else - int(float_param_value)) - return converted_value - except ValueError: - logger.warning( - "Parsed value '%s' of parameter '%s' is not a float " - "in tool '%s', degenerating to string.", param_value, - param_name, func_name) - return param_value - elif param_type in ["boolean", "bool", "binary"]: - param_value = param_value.lower() - if param_value not in ["true", "false"]: - logger.warning( - "Parsed value '%s' of parameter '%s' is not a " - "boolean (`true` of `false`) in tool '%s', " - "degenerating to false.", param_value, param_name, - func_name) - return param_value == "true" - else: - if param_type == "object" or param_type.startswith("dict"): - try: - converted_value = json.loads(param_value) - return converted_value - except json.JSONDecodeError: - logger.warning( - "Parsed value '%s' of parameter '%s' is not a " - "valid JSON object in tool '%s', will try other " - "methods to parse it.", param_value, param_name, - func_name) - logger.warning( - "Parameter '%s' has unknown type '%s'. 
" - "The value will be treated as a string.", param_name, - param_type) - return param_value - # Extract function name end_index = function_call_str.index(">") function_name = function_call_str[:end_index] - param_config = get_arguments_config(function_name) + param_config = self._get_arguments_config(function_name, tools) parameters = function_call_str[end_index + 1:] param_dict = {} - for match in self.tool_call_parameter_regex.findall(parameters): - match_text = match[0] if match[0] else match[1] + for match_text in self.tool_call_parameter_regex.findall(parameters): idx = match_text.index(">") param_name = match_text[:idx] param_value = str(match_text[idx + 1:]) @@ -231,7 +220,7 @@ class Qwen3CoderToolParser(ToolParser): if param_value.endswith("\n"): param_value = param_value[:-1] - param_dict[param_name] = convert_param_value( + param_dict[param_name] = self._convert_param_value( param_value, param_name, param_config, function_name) return ToolCall( type="function", @@ -284,8 +273,7 @@ class Qwen3CoderToolParser(ToolParser): for function_call_str in function_calls ] - # Populate prev_tool_call_arr for serving layer to set - # finish_reason + # Populate prev_tool_call_arr for serving layer to set finish_reason self.prev_tool_call_arr.clear() # Clear previous calls for tool_call in tool_calls: if tool_call: @@ -298,8 +286,8 @@ class Qwen3CoderToolParser(ToolParser): # Extract content before tool calls content_index = model_output.find(self.tool_call_start_token) - content_index = (content_index if content_index >= 0 else - model_output.find(self.tool_call_prefix)) + idx = model_output.find(self.tool_call_prefix) + content_index = content_index if content_index >= 0 else idx content = model_output[:content_index] # .rstrip() return ExtractedToolCallInformation( @@ -324,13 +312,16 @@ class Qwen3CoderToolParser(ToolParser): delta_token_ids: Sequence[int], request: ChatCompletionRequest, ) -> Union[DeltaMessage, None]: - # If no delta text, return None unless it's 
an EOS token after tool - # calls + # Store request for type conversion + if not previous_text: + self._reset_streaming_state() + self.streaming_request = request + + # If no delta text, return None unless it's an EOS token after tools if not delta_text: # Check if this is an EOS token after all tool calls are complete - # We check for tool calls in the text even if is_tool_call_started - # is False because it might have been reset after processing all - # tools + # Check for tool calls in text even if is_tool_call_started + # is False (might have been reset after processing all tools) if (delta_token_ids and self.tool_call_end_token_id not in delta_token_ids): # Count complete tool calls @@ -339,24 +330,19 @@ class Qwen3CoderToolParser(ToolParser): # If we have completed tool calls and populated # prev_tool_call_arr - if (complete_calls > 0 and len(self.prev_tool_call_arr) > 0): + if complete_calls > 0 and len(self.prev_tool_call_arr) > 0: # Check if all tool calls are closed - open_calls = ( - current_text.count(self.tool_call_start_token) - - current_text.count(self.tool_call_end_token)) + open_calls = current_text.count( + self.tool_call_start_token) - current_text.count( + self.tool_call_end_token) if open_calls == 0: - # Return empty delta message to allow finish_reason - # processing + # Return empty delta for finish_reason processing return DeltaMessage(content="") elif not self.is_tool_call_started and current_text: # This is a regular content response that's now complete return DeltaMessage(content="") return None - # Check if this is the first call (reset state if needed) - if not previous_text: - self._reset_streaming_state() - # Update accumulated text self.accumulated_text = current_text @@ -371,11 +357,11 @@ class Qwen3CoderToolParser(ToolParser): self.param_count = 0 self.json_started = False self.json_closed = False + self.accumulated_params = {} # Check if there are more tool calls - tool_starts_count = current_text.count( - 
self.tool_call_start_token) - if self.current_tool_index >= tool_starts_count: + tool_starts = current_text.count(self.tool_call_start_token) + if self.current_tool_index >= tool_starts: # No more tool calls self.is_tool_call_started = False # Continue processing next tool @@ -412,20 +398,20 @@ class Qwen3CoderToolParser(ToolParser): # We're in a tool call, find the current tool call portion # Need to find the correct tool call based on current_tool_index - tool_starts: list[int] = [] + tool_start_positions: list[int] = [] idx = 0 while True: idx = current_text.find(self.tool_call_start_token, idx) if idx == -1: break - tool_starts.append(idx) + tool_start_positions.append(idx) idx += len(self.tool_call_start_token) - if self.current_tool_index >= len(tool_starts): + if self.current_tool_index >= len(tool_start_positions): # No more tool calls to process yet return None - tool_start_idx = tool_starts[self.current_tool_index] + tool_start_idx = tool_start_positions[self.current_tool_index] # Find where this tool call ends (or current position if not ended yet) tool_end_idx = current_text.find(self.tool_call_end_token, tool_start_idx) @@ -438,19 +424,19 @@ class Qwen3CoderToolParser(ToolParser): # Looking for function header if not self.header_sent: if self.tool_call_prefix in tool_text: - func_start = (tool_text.find(self.tool_call_prefix) + - len(self.tool_call_prefix)) + func_start = tool_text.find(self.tool_call_prefix) + len( + self.tool_call_prefix) func_end = tool_text.find(">", func_start) if func_end != -1: # Found complete function name self.current_function_name = tool_text[func_start:func_end] - self.current_tool_string_id = self._generate_tool_call_id() + self.current_tool_id = self._generate_tool_call_id() self.header_sent = True self.in_function = True - # IMPORTANT: Add to prev_tool_call_arr immediately when we - # detect a tool call. This ensures + # IMPORTANT: Add to prev_tool_call_arr immediately when + # we detect a tool call. 
This ensures # finish_reason="tool_calls" even if parsing isn't complete already_added = any( tool.get("name") == self.current_function_name @@ -466,7 +452,7 @@ class Qwen3CoderToolParser(ToolParser): return DeltaMessage(tool_calls=[ DeltaToolCall( index=self.current_tool_index, - id=self.current_tool_string_id, + id=self.current_tool_id, function=DeltaFunctionCall( name=self.current_function_name, arguments=""), type="function", @@ -496,10 +482,11 @@ class Qwen3CoderToolParser(ToolParser): # Close JSON self.json_closed = True - # Extract the complete tool call to update prev_tool_call_arr - # with final arguments. Find the function content - func_start = (tool_text.find(self.tool_call_prefix) + - len(self.tool_call_prefix)) + # Extract complete tool call to update + # prev_tool_call_arr with final arguments + # Find the function content + func_start = tool_text.find(self.tool_call_prefix) + len( + self.tool_call_prefix) func_content_end = tool_text.find(self.function_end_token, func_start) if func_content_end != -1: @@ -507,15 +494,17 @@ class Qwen3CoderToolParser(ToolParser): # Parse to get the complete arguments try: parsed_tool = self._parse_xml_function_call( - func_content, request.tools if request else None) + func_content, self.streaming_request.tools + if self.streaming_request else None) if parsed_tool: - # Update existing entry in prev_tool_call_arr with - # complete arguments + # Update existing entry in + # prev_tool_call_arr with complete args for i, tool in enumerate(self.prev_tool_call_arr): - if (tool.get("name") == - parsed_tool.function.name): - self.prev_tool_call_arr[i]["arguments"] = ( - parsed_tool.function.arguments) + if tool.get( + "name") == parsed_tool.function.name: + args = parsed_tool.function.arguments + self.prev_tool_call_arr[i][ + "arguments"] = args break except Exception: pass # Ignore parsing errors during streaming @@ -530,73 +519,110 @@ class Qwen3CoderToolParser(ToolParser): # Reset state for next tool self.in_function = 
False self.json_closed = True + self.accumulated_params = {} return result # Look for parameters - # Count how many complete parameters we have processed - complete_params = tool_text.count(self.parameter_end_token) + # Find all parameter starts + param_starts = [] + idx = 0 + while True: + idx = tool_text.find(self.parameter_prefix, idx) + if idx == -1: + break + param_starts.append(idx) + idx += len(self.parameter_prefix) # Check if we should start a new parameter - if not self.in_param and self.param_count < complete_params: - # Find the unprocessed parameter - # Count parameter starts - param_starts = [] - idx = 0 - while True: - idx = tool_text.find(self.parameter_prefix, idx) - if idx == -1: - break - param_starts.append(idx) - idx += len(self.parameter_prefix) + if (not self.in_param and self.param_count < len(param_starts) + and len(param_starts) > self.param_count): + # Process the next parameter + param_idx = param_starts[self.param_count] + param_start = param_idx + len(self.parameter_prefix) + remaining = tool_text[param_start:] - if len(param_starts) > self.param_count: - # Process the next parameter - param_idx = param_starts[self.param_count] - param_start = param_idx + len(self.parameter_prefix) - remaining = tool_text[param_start:] + if ">" in remaining: + # We have the complete parameter name + name_end = remaining.find(">") + self.current_param_name = remaining[:name_end] - if ">" in remaining: - # We have the complete parameter name - name_end = remaining.find(">") - self.current_param_name = remaining[:name_end] + # Find the parameter value + value_start = param_start + name_end + 1 + value_text = tool_text[value_start:] + if value_text.startswith("\n"): + value_text = value_text[1:] - # Find the parameter value - value_start = param_start + name_end + 1 - value_text = tool_text[value_start:] - if value_text.startswith("\n"): - value_text = value_text[1:] + # Find where this parameter ends + param_end_idx = 
value_text.find(self.parameter_end_token) + if param_end_idx == -1: + # No closing tag, look for next parameter or + # function end + next_param_idx = value_text.find(self.parameter_prefix) + func_end_idx = value_text.find(self.function_end_token) - # Find where this parameter ends - param_end_idx = value_text.find( - self.parameter_end_token) - if param_end_idx != -1: - # Complete parameter found - param_value = value_text[:param_end_idx] - if param_value.endswith("\n"): - param_value = param_value[:-1] - - # Build complete JSON fragment for this parameter - if self.param_count == 0: - json_fragment = ( - '"' + self.current_param_name + '": "' + - json.dumps(param_value)[1:-1] + '"') + if next_param_idx != -1 and (func_end_idx == -1 + or next_param_idx + < func_end_idx): + param_end_idx = next_param_idx + elif func_end_idx != -1: + param_end_idx = func_end_idx + else: + # Neither found, check if tool call is complete + if self.tool_call_end_token in tool_text: + # Tool call is complete, so parameter + # must be complete too. 
Use all + # remaining text before function end + param_end_idx = len(value_text) else: - json_fragment = ( - ', "' + self.current_param_name + '": "' + - json.dumps(param_value)[1:-1] + '"') + # Still streaming, wait for more content + return None - self.param_count += 1 + if param_end_idx != -1: + # Complete parameter found + param_value = value_text[:param_end_idx] + if param_value.endswith("\n"): + param_value = param_value[:-1] - return DeltaMessage(tool_calls=[ - DeltaToolCall( - index=self.current_tool_index, - function=DeltaFunctionCall( - arguments=json_fragment), - ) - ]) + # Store raw value for later processing + self.accumulated_params[ + self.current_param_name] = param_value - # Continue parameter value + # Get parameter configuration for type conversion + param_config = self._get_arguments_config( + self.current_function_name or "", + self.streaming_request.tools + if self.streaming_request else None) + + # Convert param value to appropriate type + converted_value = self._convert_param_value( + param_value, self.current_param_name, param_config, + self.current_function_name or "") + + # Build JSON fragment based on the converted type + # Use json.dumps to properly serialize the value + serialized_value = json.dumps(converted_value, + ensure_ascii=False) + + if self.param_count == 0: + json_fragment = (f'"{self.current_param_name}": ' + f'{serialized_value}') + else: + json_fragment = (f', "{self.current_param_name}": ' + f'{serialized_value}') + + self.param_count += 1 + + return DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall( + arguments=json_fragment), + ) + ]) + + # Continue parameter value - Not used in the current implementation + # since we process complete parameters above if self.in_param: if self.parameter_end_token in delta_text: # End of parameter @@ -608,25 +634,42 @@ class Qwen3CoderToolParser(ToolParser): gt_idx = value_chunk.find(">") value_chunk = value_chunk[gt_idx + 1:] - if 
(not self.current_param_value - and value_chunk.startswith("\n")): + if not self.current_param_value and value_chunk.startswith( + "\n"): value_chunk = value_chunk[1:] - # Calculate incremental JSON + # Store complete value full_value = self.current_param_value + value_chunk - prev_escaped = (json.dumps(self.current_param_value)[1:-1] - if self.current_param_value else "") - full_escaped = json.dumps(full_value)[1:-1] - delta_escaped = full_escaped[len(prev_escaped):] + self.accumulated_params[ + self.current_param_name] = full_value + # Get parameter configuration for type conversion + param_config = self._get_arguments_config( + self.current_function_name or "", + self.streaming_request.tools + if self.streaming_request else None) + + # Convert the parameter value to the appropriate type + converted_value = self._convert_param_value( + full_value, self.current_param_name or "", + param_config, self.current_function_name or "") + + # Serialize the converted value + serialized_value = json.dumps(converted_value, + ensure_ascii=False) + + # Since we've been streaming the quoted version, + # we need to close it properly + # This is complex - for now just complete the value self.in_param = False self.current_param_value = "" + # Just close the current parameter string return DeltaMessage(tool_calls=[ DeltaToolCall( index=self.current_tool_index, function=DeltaFunctionCall( - arguments=delta_escaped + '"'), + arguments='"'), # Close the string quote ) ]) else: @@ -638,18 +681,18 @@ class Qwen3CoderToolParser(ToolParser): gt_idx = value_chunk.find(">") value_chunk = value_chunk[gt_idx + 1:] - if (not self.current_param_value - and value_chunk.startswith("\n")): + if not self.current_param_value and value_chunk.startswith( + "\n"): value_chunk = value_chunk[1:] if value_chunk: # Stream the escaped delta - prev_escaped = (json.dumps( - self.current_param_value)[1:-1] - if self.current_param_value else "") + prev_escaped = json.dumps( + self.current_param_value, 
ensure_ascii=False + )[1:-1] if self.current_param_value else "" self.current_param_value += value_chunk - full_escaped = json.dumps( - self.current_param_value)[1:-1] + full_escaped = json.dumps(self.current_param_value, + ensure_ascii=False)[1:-1] delta_escaped = full_escaped[len(prev_escaped):] if delta_escaped: @@ -661,4 +704,4 @@ class Qwen3CoderToolParser(ToolParser): ) ]) - return None + return None \ No newline at end of file From c905684cfeaee3b2be2c736eee473b2c6ae7f7bf Mon Sep 17 00:00:00 2001 From: Chenheli Hua <huachenheli@outlook.com> Date: Tue, 26 Aug 2025 20:05:34 -0700 Subject: [PATCH 065/112] [Core] Asynchronous h2d in merge_multimodal_embeddings via pinned memory. (#23686) Signed-off-by: Chenheli Hua <huachenheli@outlook.com> Co-authored-by: Roger Wang <hey@rogerw.io> --- vllm/model_executor/models/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 6c27fedc61b17..11e098f1d7bdb 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -508,7 +508,9 @@ def merge_multimodal_embeddings( """ if isinstance(placeholder_token_id, list): placeholder_token_id = torch.tensor(placeholder_token_id, - device=input_ids.device) + pin_memory=True).to( + device=input_ids.device, + non_blocking=True) return _merge_multimodal_embeddings( inputs_embeds, torch.isin(input_ids, placeholder_token_id), From 644d57d53191b94d9e50a4765891c498790d924b Mon Sep 17 00:00:00 2001 From: CSWYF3634076 <58356743+CSWYF3634076@users.noreply.github.com> Date: Wed, 27 Aug 2025 12:02:55 +0800 Subject: [PATCH 066/112] [Model] Add Ernie4.5 VL Model Support (#22514) Signed-off-by: wangyafeng <wangyafeng@baidu.com> --- docs/models/supported_models.md | 1 + examples/offline_inference/vision_language.py | 32 + requirements/test.in | 1 + requirements/test.txt | 3 + .../multimodal/processing/test_common.py | 1 + tests/models/registry.py | 2 + 
.../rotary_embedding/ernie45_vl_rope.py | 72 + .../layers/rotary_embedding/mrope.py | 123 ++ vllm/model_executor/models/ernie45_vl.py | 1504 +++++++++++++++++ vllm/model_executor/models/ernie45_vl_moe.py | 723 ++++++++ vllm/model_executor/models/registry.py | 1 + 11 files changed, 2463 insertions(+) create mode 100644 vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py create mode 100644 vllm/model_executor/models/ernie45_vl.py create mode 100644 vllm/model_executor/models/ernie45_vl_moe.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 74f3a9d1cdb56..19ce8c06724f4 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -616,6 +616,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ | | `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ | | `DonutForConditionalGeneration`<sup>^</sup> | Donut | T + I | `ByteDance/Dolphin`, `naver-clova-ix/donut-base-finetuned-docvqa`, etc. | | | | +| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | ✅︎ | | `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | | | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ | | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. 
| ✅︎ | ✅︎ | ⚠️ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 8d97ba2668263..4e879666f61d7 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -173,6 +173,37 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: ) +# Ernie4.5-VL +def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData: + model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={modality: 1}, + trust_remote_code=True, + ) + + if modality == "image": + placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>" + elif modality == "video": + placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>" + + prompts = [ + ( + f"<|begin_of_sentence|>User: {question}{placeholder}\n" + "Assistant: <think></think>" + ) + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # Florence2 def run_florence2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1602,6 +1633,7 @@ model_example_map = { "chameleon": run_chameleon, "command_a_vision": run_command_a_vision, "deepseek_vl_v2": run_deepseek_vl2, + "ernie45_vl": run_ernie45_vl, "florence2": run_florence2, "fuyu": run_fuyu, "gemma3": run_gemma3, diff --git a/requirements/test.in b/requirements/test.in index 098a9242bc3af..92c577c501632 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -54,3 +54,4 @@ runai-model-streamer-s3==0.11.0 fastsafetensors>=0.1.10 pydantic>=2.10 # 2.9 leads to error on python 3.10 terratorch==1.1rc2 # required for PrithviMAE test +decord==0.6.0 diff --git a/requirements/test.txt b/requirements/test.txt index 8b872752d875c..0c27c9bb67e82 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -156,6 +156,8 @@ 
datasets==3.0.2 # mteb decorator==5.1.1 # via librosa +decord==0.6.0 + # via -r requirements/test.in dill==0.3.8 # via # datasets @@ -493,6 +495,7 @@ numpy==1.26.4 # contourpy # cupy-cuda12x # datasets + # decord # einx # encodec # evaluate diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 74ca10d32609a..6361cb9b5586a 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -272,6 +272,7 @@ def _test_processing_correctness_one( "CohereLabs/command-a-vision-07-2025", "deepseek-ai/deepseek-vl2-tiny", "naver-clova-ix/donut-base-finetuned-docvqa", + "baidu/ERNIE-4.5-VL-28B-A3B-PT", "microsoft/Florence-2-base", "adept/fuyu-8b", "google/gemma-3-4b-it", diff --git a/tests/models/registry.py b/tests/models/registry.py index 20c7c3af67764..f2c09d3e8452a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -396,6 +396,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { transformers_version_reason="HF model is not compatible.", # noqa: E501 hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501 "Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), + "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo("baidu/ERNIE-4.5-VL-28B-A3B-PT", # noqa: E501 + trust_remote_code=True), "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"), "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it", # noqa: E501 diff --git a/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py b/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py new file mode 100644 index 0000000000000..05322e56f2620 --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing 
import Optional + +import torch + +from .common import apply_rotary_emb_dispatch +from .mrope import MRotaryEmbedding + + +class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding): + """3D rotary positional embedding. 3D is t:time h:height w:width""" + + def forward( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + assert positions.ndim == 1 or positions.ndim == 2 + assert key is not None + + num_tokens = positions.shape[-1] + cos_sin = self.cos_sin_cache[positions] + cos, sin = cos_sin.chunk(2, dim=-1) + if positions.ndim == 2: + assert self.mrope_section + + section_h = self.mrope_section[0] # 22 + section_w = self.mrope_section[1] # 22 + section_t = self.mrope_section[2] # 20 + assert section_h == section_w + # Split according to [h w h w h w h w... t t t...] + section_cos_t = cos[..., -section_t:] + section_cos_h = cos[..., :section_h + section_w:2] + section_cos_w = cos[..., 1:section_h + section_w:2] + + cos_t, cos_h, cos_w = section_cos_t[0], section_cos_h[ + 1], section_cos_w[2] + cos_hw = torch.stack([cos_h, cos_w], + dim=-1).reshape(cos_h.shape[:-1] + + (cos_h.shape[-1] * 2, )) + cos = torch.cat([cos_hw, cos_t], dim=-1) + + section_sin_t = sin[..., -section_t:] + section_sin_h = sin[..., :section_h + section_w:2] + section_sin_w = sin[..., 1:section_h + section_w:2] + + sin_t, sin_h, sin_w = section_sin_t[0], section_sin_h[ + 1], section_sin_w[2] + sin_hw = torch.stack([sin_h, sin_w], + dim=-1).reshape(sin_h.shape[:-1] + + (sin_h.shape[-1] * 2, )) + sin = torch.cat([sin_hw, sin_t], dim=-1) + + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, + self.is_neox_style) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + key_shape = key.shape + key = 
key.view(num_tokens, -1, self.head_size) + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, + self.is_neox_style) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index a091cfb743291..e374aa9bebf9e 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -393,6 +393,15 @@ class MRotaryEmbedding(RotaryEmbedding): context_len=context_len, seq_len=seq_len, ) + elif hf_config.model_type in ["ernie4_5_moe_vl", "ernie4_5_vl"]: + return cls._ernie_get_input_positions_tensor( + input_tokens=input_tokens, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + context_len=context_len, + seq_len=seq_len, + ) else: return cls._vl_get_input_positions_tensor( input_tokens=input_tokens, @@ -513,6 +522,120 @@ class MRotaryEmbedding(RotaryEmbedding): len(input_tokens)).item() return llm_positions, mrope_position_delta + @classmethod + def _ernie_get_input_positions_tensor( + cls, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Union[list[list[int]], torch.Tensor], + video_grid_thw: Union[list[list[int]], torch.Tensor], + context_len: int = 0, + seq_len: Optional[int] = None, + ) -> tuple[torch.Tensor, int]: + """Get mrope input positions and delta value for Ernie VL.""" + + image_token_id = hf_config.im_patch_id + video_start_token_id = hf_config.video_start_token_id + video_end_token_id = hf_config.video_end_token_id + spatial_conv_size = hf_config.spatial_conv_size + temporal_conv_size = hf_config.temporal_conv_size + llm_pos_ids_list: list = [] + + if not (image_grid_thw is None and video_grid_thw is None): + if isinstance(image_grid_thw, torch.Tensor): + image_grid_thw = image_grid_thw.tolist() + + 
input_token_type: list[str] = [] + video_check_flg = False + for token in input_tokens: + if token == video_start_token_id: + video_check_flg = True + elif token == video_end_token_id: + video_check_flg = False + + if (token == image_token_id) and (video_check_flg is False): + input_token_type.append("image") + elif (token == image_token_id) and (video_check_flg is True): + input_token_type.append("video") + else: + input_token_type.append("text") + + input_type_group: list[tuple[str, int, int]] = [] + for key, group_iter in itertools.groupby( + enumerate(input_token_type), lambda x: x[1]): + group_list = list(group_iter) + start_index = group_list[0][0] + end_index = group_list[-1][0] + 1 + input_type_group.append((key, start_index, end_index)) + + video_frame_num = 1 + mm_data_idx = 0 + for modality_type, start_idx, end_idx in input_type_group: + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + if modality_type == "image": + t, h, w = ( + image_grid_thw[mm_data_idx][0], + image_grid_thw[mm_data_idx][1], + image_grid_thw[mm_data_idx][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = \ + t, h // spatial_conv_size, w // spatial_conv_size + + t_index = torch.arange(llm_grid_t).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( + llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( + llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + st_idx) + mm_data_idx += 1 + + elif modality_type == "video": + t, h, w = ( + video_grid_thw[mm_data_idx][0], + video_grid_thw[mm_data_idx][1], + video_grid_thw[mm_data_idx][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = (t // + temporal_conv_size, + h // + spatial_conv_size, + w // + spatial_conv_size) + + for t_idx in range(llm_grid_t): + t_index = torch.tensor(t_idx).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w).flatten() + h_index 
= torch.arange(llm_grid_h).view( + 1, -1, 1).expand(1, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view( + 1, 1, -1).expand(1, llm_grid_h, -1).flatten() + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + st_idx) + + mm_data_idx += 1 + video_frame_num += 1 + + else: + text_len = end_idx - start_idx + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + + st_idx) + video_frame_num = 1 + + else: + text_len = len(input_tokens) + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1)) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + llm_positions = llm_positions[:, context_len:seq_len] + mrope_position_delta = (llm_positions.max() + 1 - + len(input_tokens)).item() + return llm_positions, mrope_position_delta + @classmethod def _vl_get_input_positions_tensor( cls, diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py new file mode 100644 index 0000000000000..d880fc434e20f --- /dev/null +++ b/vllm/model_executor/models/ernie45_vl.py @@ -0,0 +1,1504 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The Baidu team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Ernie VL model compatible with HuggingFace weights."""
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from functools import partial
+from typing import Any, Callable, Literal, Optional, TypedDict, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from transformers import BatchFeature
+
+from vllm.config import VllmConfig
+from vllm.distributed import parallel_state
+from vllm.distributed import utils as dist_utils
+from vllm.logger import init_logger
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.layers.activation import QuickGELU
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargsItems)
+from vllm.multimodal.parse import ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptUpdate)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.platforms import _Backend, current_platform
+from vllm.sequence import IntermediateTensors
+
+from .ernie45_vl_moe import Ernie4_5_VLMoeForCausalLM
+from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) +from .utils import (AutoWeightsLoader, WeightsMapper, maybe_prefix, + merge_multimodal_embeddings) +from .vision import get_vit_attn_backend + +logger = init_logger(__name__) + +_MAX_FRAMES_PER_VIDEO = 16 + +# === Vision Transformer === # + + +def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor: + if not interleaved: + x1, x2 = x.chunk(2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + else: + x1, x2 = x[..., ::2], x[..., 1::2] + return rearrange(torch.stack((-x2, x1), dim=-1), + "... d two -> ... (d two)", + two=2) + + +def apply_rotary_emb_torch(x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + interleaved: bool = False) -> torch.Tensor: + """ + x: (batch_size, seqlen, nheads, headdim) + cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2) + """ + ro_dim = cos.shape[-1] * 2 + assert ro_dim <= x.shape[-1] + cos = repeat( + cos, + "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)") + sin = repeat( + sin, + "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 
1 (d 2)") + return torch.cat( + [ + x[..., :ro_dim] * cos + + rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:] + ], + dim=-1, + ) + + +def apply_rotary_pos_emb_vision(t: torch.Tensor, + freqs: torch.Tensor) -> torch.Tensor: + t_ = t.float() + cos = freqs.cos() + sin = freqs.sin() + apply_rotary_emb = apply_rotary_emb_torch + if current_platform.is_cuda(): + from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb + output = apply_rotary_emb(t_, cos, sin).type_as(t) + return output + + +def all_gather_interleave(local_tensor, hidden_size: int, tp_size: int): + """All-gather the input tensor interleavely across model parallel group.""" + import torch.distributed as dist + gathered_tensors = [torch.zeros_like(local_tensor) for _ in range(tp_size)] + dist.all_gather(gathered_tensors, + local_tensor, + group=parallel_state.get_tp_group().device_group) + + gathered_tensors_split = [ + torch.split(tensor, hidden_size // tp_size, -1) + for tensor in gathered_tensors + ] + ordered_tensors = [ + tensor for pair in zip(*gathered_tensors_split) for tensor in pair + ] + result_tensor = torch.cat(ordered_tensors, dim=-1) + return result_tensor + + +class Ernie4_5_VisionAttention(nn.Module): + """VisionAttention using VLLM framework APIs""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + projection_size: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + # Per attention head and per partition values. 
+ self.tp_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() + self.hidden_size_per_attention_head = dist_utils.divide( + projection_size, num_heads) + self.num_attention_heads_per_partition = dist_utils.divide( + num_heads, self.tp_size) + + self.qkv = QKVParallelLinear( + hidden_size=embed_dim, + head_size=self.hidden_size_per_attention_head, + total_num_heads=num_heads, + total_num_kv_heads=num_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv") + self.proj = RowParallelLinear(input_size=projection_size, + output_size=embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj") + + # Detect attention implementation. + self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + if self.attn_backend not in { + _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS, + _Backend.ROCM_AITER_FA + }: + raise RuntimeError( + f"Ernie45-VL does not support {self.attn_backend} backend now." 
+ ) + self.is_flash_attn_backend = self.attn_backend in { + _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA + } + + def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: + # [s, b, 3 * head * head_dim] + seq_len, bs, _ = qkv.shape + if self.tp_size > 1: + qkv = all_gather_interleave(qkv, self.qkv.hidden_size, + self.tp_size) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] + q, k, v = qkv.chunk(3, dim=2) + + # 3 * [s, b, head * head_dim] + if self.tp_size > 1: + splitter = partial(dist_utils.split_tensor_along_last_dim, + num_partitions=self.tp_size) + q = splitter(q)[self.tp_rank] + k = splitter(k)[self.tp_rank] + v = splitter(v)[self.tp_rank] + + # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] + new_shape = (seq_len, bs, self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + q, k, v = (x.view(*new_shape) for x in (q, k, v)) + return q, k, v + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + max_seqlen: Optional[int] = None, # Only used for Flash Attention + seqlens: Optional[list[int]] = None, # Only used for xFormers + ) -> torch.Tensor: + # [s, b, c] --> [s, b, head * 3 * head_dim] + x, _ = self.qkv(x) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] + q, k, v = self.split_qkv(x) + batch_size = q.shape[1] + + q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() + for x in (q, k, v)) + if rotary_pos_emb is not None: + q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) + k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + + if self.is_flash_attn_backend: + # from vllm_flash_attn.flash_attn_interface import ( + # flash_attn_varlen_func) + if self.attn_backend == _Backend.ROCM_AITER_FA: + from aiter import flash_attn_varlen_func + else: + from flash_attn import flash_attn_varlen_func + + q, k, v = (rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) + + output = flash_attn_varlen_func(q, + k, + v, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, + dropout_p=0.0, + causal=False) + + context_layer = rearrange(output, + "(b s) ... -> b s ...", + b=batch_size) + elif self.attn_backend == _Backend.TORCH_SDPA: + # Execute attention entry by entry for speed & less VRAM. + outputs = [] + for i in range(1, len(cu_seqlens)): + start_idx = cu_seqlens[i - 1] + end_idx = cu_seqlens[i] + q_i = q[:, start_idx:end_idx] + k_i = k[:, start_idx:end_idx] + v_i = v[:, start_idx:end_idx] + q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d") + for x in [q_i, k_i, v_i]) + output_i = F.scaled_dot_product_attention(q_i, + k_i, + v_i, + dropout_p=0.0) + output_i = rearrange(output_i, "b h s d -> b s h d ") + outputs.append(output_i) + context_layer = torch.cat(outputs, dim=1) + elif self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalMask + + attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, + kv_seqlen=None, + device=q.device) + + context_layer = xops.memory_efficient_attention_forward( + q, k, v, attn_bias=attn_bias, p=0, scale=None) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() + + output, _ = self.proj(context_layer) + return output + + +class Ernie4_5_VisionMLP(nn.Module): + + def __init__( + self, + in_features: int, + hidden_features: int, + act_layer: type[nn.Module] = QuickGELU, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.fc1 = ColumnParallelLinear(in_features, + hidden_features, + quant_config=quant_config, + prefix=f"{prefix}.fc1") + self.act = act_layer() + self.fc2 = RowParallelLinear(hidden_features, + in_features, + quant_config=quant_config, + prefix=f"{prefix}.fc2") + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_parallel, _ = 
self.fc1(x) + x_parallel = self.act(x_parallel) + x, _ = self.fc2(x_parallel) + return x + + +class Ernie4_5_VisionBlock(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float, + act_layer: type[nn.Module] = QuickGELU, + norm_layer: Optional[Callable[[int], nn.Module]] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.norm1 = norm_layer(dim) + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + + self.attn = Ernie4_5_VisionAttention(embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.attn") + + self.mlp = Ernie4_5_VisionMLP(dim, + mlp_hidden_dim, + act_layer=act_layer, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + max_seqlen: Optional[int] = None, # Only used for Flash Attention + seqlens: Optional[list[int]] = None, # Only used for xFormers + ) -> torch.Tensor: + + hidden_states = hidden_states + self.attn( + self.norm1(hidden_states), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) + return hidden_states + + +class Ernie4_5_VisionPatchEmbed(nn.Module): + + def __init__( + self, + patch_size: int = 14, + in_channels: int = 3, + embed_dim: int = 1280, + prefix="", + ) -> None: + + super().__init__() + self.patch_size = patch_size + self.in_channels = in_channels + self.embed_dim = embed_dim + + self.proj = nn.Linear(in_channels * patch_size * patch_size, + embed_dim, + bias=False) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + + target_dtype = self.proj.weight.dtype + hidden_states = hidden_states.to(target_dtype) + hidden_states = 
self.proj(hidden_states) + + return hidden_states + + +class Ernie4_5_VisionRotaryEmbedding(nn.Module): + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + self.inv_freq = 1.0 / theta**( + torch.arange(start=0, end=dim, step=2, dtype=torch.float32) / dim) + + def forward(self, seqlen: int) -> torch.Tensor: + seq = torch.arange(seqlen, + device=self.inv_freq.device, + dtype=self.inv_freq.dtype) + freqs = torch.outer(input=seq, vec2=self.inv_freq) + return freqs + + +class Ernie4_5_VisionTransformer(nn.Module): + + def __init__( + self, + vision_config, + norm_eps: float = 1e-6, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + + super().__init__() + patch_size = vision_config.patch_size + spatial_merge_size = vision_config.spatial_merge_size + in_channels = vision_config.in_channels + hidden_size = vision_config.hidden_size + embed_dim = vision_config.embed_dim + depth = vision_config.depth + num_heads = vision_config.num_heads + mlp_ratio = vision_config.mlp_ratio + + self.spatial_merge_size = spatial_merge_size + self.num_heads = num_heads + self.embed_dim = embed_dim + + self.patch_embed = Ernie4_5_VisionPatchEmbed( + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim, + prefix=f"{prefix}.patch_embed", + ) + + norm_layer = partial(nn.LayerNorm, eps=norm_eps) + head_dim = embed_dim // num_heads + self.rotary_pos_emb = Ernie4_5_VisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList([ + Ernie4_5_VisionBlock(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}") + for layer_idx in range(depth) + ]) + + assert (hidden_size == embed_dim + ), "vit's config.hidden must be equal to config.embed_dim" + self.ln = nn.LayerNorm(hidden_size, eps=1e-6) + + self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + + @property + def dtype(self) -> torch.dtype: 
+ return self.patch_embed.proj.weight.dtype + + @property + def device(self) -> torch.device: + return self.patch_embed.proj.weight.device + + def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + pos_ids.append( + torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def compute_attn_mask_seqlen( + self, cu_seqlens: torch.Tensor + ) -> tuple[Optional[int], Optional[list[int]]]: + max_seqlen, seqlens = None, None + if self.attn_backend == _Backend.FLASH_ATTN: + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + elif self.attn_backend == _Backend.XFORMERS: + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + return max_seqlen, seqlens + + def forward(self, + hidden_states: torch.Tensor, + grid_thw: torch.Tensor, + num_pad=0) -> torch.Tensor: + + hidden_states = self.patch_embed(hidden_states) + + rotary_pos_emb = self.rot_pos_emb(grid_thw) + rotary_pos_emb = rotary_pos_emb.to(hidden_states.device) + + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], + grid_thw[:, 0]).cumsum( + dim=0, dtype=torch.int32) + + if num_pad > 0: + cu_seqlens = F.pad(cu_seqlens, (1, 1), value=0) + cu_seqlens[-1] = cu_seqlens[-2] + num_pad + else: + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + + # add batch 
size + if hidden_states.ndim == 2: + hidden_states = hidden_states.unsqueeze(dim=1) + + # pre-compute seqlens for attn mask to reduce cuMemcpy operations + max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) + + for i, blk in enumerate(self.blocks): + hidden_states = blk( + hidden_states, + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + + final_output = self.ln(hidden_states) + + if final_output.ndim == 3: + final_output = final_output.squeeze(dim=1) + + return final_output + + def load_weights(self, weights) -> set[str]: + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +# === Vision Inputs === # + + +class Ernie4_5_VLImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + pixel_values: torch.Tensor + """Shape: + `(num_patches, num_channels * patch_size * patch_size)` + """ + + grid_thw: torch.Tensor + """Shape: `(num_images, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +Ernie4_5_VLImageInputs = Ernie4_5_VLImagePixelInputs + + +class Ernie4_5_VLVideoPixelInputs(TypedDict): + type: Literal["pixel_values_videos"] + pixel_values_videos: torch.Tensor + """Shape: + `(num_patches, + num_channels * temporal_patch_size * patch_size * patch_size)` + """ + + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + + This should be in `(grid_t, grid_h, grid_w)` format. 
+ """ + + +Ernie4_5_VLVideoInputs = Ernie4_5_VLImagePixelInputs + +# === Vision Processor === # + + +def round_by_factor(number: Union[int, float], factor: int) -> int: + return round(number / factor) * factor + + +def ceil_by_factor(number: Union[int, float], factor: int) -> int: + return math.ceil(number / factor) * factor + + +def floor_by_factor(number: Union[int, float], factor: int) -> int: + return math.floor(number / factor) * factor + + +def smart_resize( + height: int, + width: int, + factor: int = 28, + min_pixels: int = 4 * 28 * 28, + max_pixels: int = 16384 * 28 * 28, +): + MAX_RATIO = 200 + if max(height, width) / min(height, width) > MAX_RATIO: + if height > width: + new_width = max(factor, round_by_factor(width, factor)) + new_height = floor_by_factor(new_width * MAX_RATIO, factor) + else: + new_height = max(factor, round_by_factor(height, factor)) + new_width = floor_by_factor(new_height * MAX_RATIO, factor) + + height = new_height + width = new_width + + h_bar = max(factor, round_by_factor(height, factor)) + w_bar = max(factor, round_by_factor(width, factor)) + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = floor_by_factor(height / beta, factor) + w_bar = floor_by_factor(width / beta, factor) + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = ceil_by_factor(height * beta, factor) + w_bar = ceil_by_factor(width * beta, factor) + + if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels: + raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}") + + return h_bar, w_bar + + +class VariableResolutionResamplerModel(nn.Module): + + def __init__(self, + in_dim, + out_dim, + spatial_conv_size, + temporal_conv_size, + config, + prefix: str = "") -> None: + super().__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.config = config + self.spatial_conv_size = spatial_conv_size + self.temporal_conv_size = temporal_conv_size + 
self.use_temporal_conv = config.use_temporal_conv + + # compress 2d conv(picture) to 1d + self.spatial_dim = (self.in_dim * self.spatial_conv_size * + self.spatial_conv_size) + # compress 3d conv(video) to 1d + self.temporal_dim = (self.in_dim * self.spatial_conv_size * + self.spatial_conv_size * self.temporal_conv_size) + + self.spatial_linear1 = ColumnParallelLinear( + self.spatial_dim, + self.spatial_dim, + bias=True, + gather_output=True, + quant_config=getattr(config, 'quant_config', None), + prefix=f"{prefix}.spatial_linear1", + ) + + self.spatial_gelu = nn.GELU() + + self.spatial_linear2 = ColumnParallelLinear( + self.spatial_dim, + self.spatial_dim, + bias=True, + gather_output=True, + quant_config=getattr(config, 'quant_config', None), + prefix=f"{prefix}.spatial_linear2", + ) + + self.spatial_norm = nn.LayerNorm(self.spatial_dim, eps=1e-6) + + if self.use_temporal_conv: + self.temporal_linear1 = ColumnParallelLinear( + self.temporal_dim, + self.spatial_dim, + bias=True, + gather_output=True, + quant_config=getattr(config, 'quant_config', None), + prefix=f"{prefix}.temporal_linear1", + ) + + self.temporal_gelu = nn.GELU() + + self.temporal_linear2 = ColumnParallelLinear( + self.spatial_dim, + self.spatial_dim, + bias=True, + gather_output=True, + quant_config=getattr(config, 'quant_config', None), + prefix=f"{prefix}.temporal_linear2", + ) + + self.temporal_norm = nn.LayerNorm(self.spatial_dim, eps=1e-6) + + self.mlp = ColumnParallelLinear( + self.spatial_dim, + self.out_dim, + bias=True, + gather_output=True, + quant_config=getattr(config, 'quant_config', None), + prefix=f"{prefix}.mlp", + ) + + self.after_norm = RMSNorm(hidden_size=out_dim, + eps=getattr(config, 'rms_norm_eps', 1e-6)) + + def spatial_conv_reshape(self, x, spatial_conv_size): + S, C = x.shape + x = x.reshape([-1, C * (spatial_conv_size**2)]) + return x + + def forward(self, x, grid_thw): + + def fwd_spatial(x): + x = self.spatial_conv_reshape(x, self.spatial_conv_size) + + x, _ = 
self.spatial_linear1(x) + x = self.spatial_gelu(x) + x, _ = self.spatial_linear2(x) + x = self.spatial_norm(x) + + return x + + def fwd_placeholder(x, grid_thw, to_tensor=False): + + grid_thw_cpu = grid_thw.cpu().numpy() + grid_t, grid_hw = grid_thw_cpu[:, 0], grid_thw_cpu[:, 1:] + grid_hw_after_conv = grid_hw.prod(-1) // (self.spatial_conv_size** + 2) + + tokens_per_img_or_vid = grid_thw_cpu.prod(-1) // ( + self.spatial_conv_size**2) + batch_offset = np.empty(tokens_per_img_or_vid.size, + dtype=tokens_per_img_or_vid.dtype) + batch_offset[0] = 0 + batch_offset[1:] = tokens_per_img_or_vid.cumsum()[:-1] + + slice_offsets = [] + for temporoal_size, spatial_size, b_offset in zip( + grid_t, grid_hw_after_conv, batch_offset): + for temp_offset in range(0, temporoal_size, 2): + slice_offsets.append( + np.arange( + b_offset + (temp_offset) * spatial_size, + b_offset + (temp_offset + 1) * spatial_size, + )) + slice_offsets = torch.tensor(np.concatenate(slice_offsets, + axis=-1)).to(x.device) + + slice_offsets2 = [] + for temporoal_size, spatial_size, b_offset in zip( + grid_t, grid_hw_after_conv, batch_offset): + for temp_offset in range(1 if temporoal_size > 1 else 0, + temporoal_size, 2): + slice_offsets2.append( + np.arange( + b_offset + (temp_offset) * spatial_size, + b_offset + (temp_offset + 1) * spatial_size, + )) + slice_offsets2 = torch.tensor( + np.concatenate(slice_offsets2, axis=-1)).to(x.device) + + x_timestep_1 = torch.index_select(x, dim=0, index=slice_offsets) + x_timestep_2 = torch.index_select(x, dim=0, index=slice_offsets2) + x = torch.concat([x_timestep_1, x_timestep_2], dim=-1) + return x + + def fwd_temporal(x): + x, _ = self.temporal_linear1(x) + x = self.temporal_gelu(x) + x, _ = self.temporal_linear2(x) + x = self.temporal_norm(x) + return x + + def fwd_mlp(x): + x, _ = self.mlp(x) + x = self.after_norm(x) + return x + + x = fwd_spatial(x) + if self.use_temporal_conv: + x = fwd_placeholder(x, grid_thw) + x = fwd_temporal(x) + x = fwd_mlp(x) + return 
x + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class Ernie4_5_VLProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self): + return self.ctx.model_config.hf_config + + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(use_fast=True, **kwargs) + + def get_image_processor(self, **kwargs: object): + return self.get_hf_processor(**kwargs).image_processor + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def _get_vision_info( + self, + *, + image_width: int, + image_height: int, + num_frames: int = 1, + do_resize: bool = True, + image_processor: Optional[Any], + ) -> tuple[ImageSize, int]: + if image_processor is None: + image_processor = self.get_image_processor() + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + + patch_size = vision_config.patch_size + spatial_conv_size = hf_config.spatial_conv_size + temporal_conv_size = hf_config.temporal_conv_size + + if do_resize: + resized_height, resized_width = smart_resize( + height=image_height, + width=image_width, + factor=patch_size * spatial_conv_size, + min_pixels=image_processor.min_pixels, + max_pixels=image_processor.max_pixels, + ) + preprocessed_size = ImageSize(width=resized_width, + height=resized_height) + else: + preprocessed_size = ImageSize(width=image_width, + height=image_height) + + grid_t = max(num_frames // temporal_conv_size, 1) + grid_h = preprocessed_size.height // patch_size + grid_w = preprocessed_size.width // patch_size + + num_patches = grid_t * grid_h * 
grid_w + num_vision_tokens = num_patches // (spatial_conv_size**2) + + return preprocessed_size, num_vision_tokens + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + image_processor: Optional[Any], + ) -> int: + _, num_image_tokens = self._get_vision_info( + image_width=image_width, + image_height=image_height, + image_processor=image_processor, + ) + return num_image_tokens + + def get_num_video_tokens( + self, + *, + image_width: int, + image_height: int, + num_frames: int, + image_processor: Optional[Any], + ) -> int: + _, num_video_tokens = self._get_vision_info( + image_width=image_width, + image_height=image_height, + num_frames=num_frames, + image_processor=image_processor, + ) + return num_video_tokens + + def get_image_size_with_most_features(self) -> ImageSize: + max_image_size, _ = self._get_vision_info( + image_width=9999999, + image_height=9999999, + image_processor=None, + ) + return max_image_size + + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + num_image_tokens = self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + image_processor=None, + ) + return num_image_tokens + + def _get_max_video_frames(self, max_tokens: int) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + num_frames = 0 + + while True: + next_num_frames = num_frames + 1 + next_max_tokens = self.get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=next_num_frames, + image_processor=None, + ) + + if next_max_tokens > max_tokens: + break + + num_frames = next_num_frames + + # If the number of frames is odd, discard one frame. 
+ if num_frames % 2 != 0: + num_frames -= 1 + + return num_frames + + def get_num_frames_with_most_features( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> int: + max_images = mm_counts.get("image", 0) + max_videos = mm_counts.get("video", 0) + + max_image_tokens = self.get_max_image_tokens() * max_images + max_total_frames = self._get_max_video_frames(seq_len - + max_image_tokens) + max_frames_per_video = min(max_total_frames // max(max_videos, 1), + _MAX_FRAMES_PER_VIDEO) + + return max(max_frames_per_video, 2) + + def get_max_video_tokens( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=self.get_num_frames_with_most_features( + seq_len, mm_counts), + image_processor=None, + ) + + +class Ernie4_5VLMultiModalProcessor( + BaseMultiModalProcessor[Ernie4_5_VLProcessingInfo]): + + def _pixel_values_norm( + self, + pixel_values: torch.Tensor, + mm_kwargs: object, + ) -> torch.Tensor: + hf_config = self.info.get_hf_config() + vision_config = hf_config.vision_config + image_processor = self.info.get_image_processor(**mm_kwargs) + image_mean_tensor = torch.tensor(image_processor.image_mean, + dtype=torch.float32).reshape( + [1, 3, 1, 1]) + image_std_tensor = torch.tensor(image_processor.image_std, + dtype=torch.float32).reshape( + [1, 3, 1, 1]) + rescale_factor = torch.tensor(image_processor.rescale_factor, + dtype=torch.float32) + patch_size_squared = vision_config.patch_size**2 + + image_mean_tensor = (image_mean_tensor.squeeze( + [-2, -1]).repeat_interleave(patch_size_squared, -1)) + image_std_tensor = (image_std_tensor.squeeze( + [-2, -1]).repeat_interleave(patch_size_squared, -1)) + + if not image_mean_tensor.is_contiguous(): + image_mean_tensor = image_mean_tensor.contiguous() + if not image_std_tensor.is_contiguous(): + image_std_tensor = 
image_std_tensor.contiguous() + + pixel_values = (rescale_factor * pixel_values.to(torch.float32) - + image_mean_tensor) / image_std_tensor + pixel_values = pixel_values.to(hf_config.torch_dtype) + return pixel_values + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + # when the prompt is not empty but the multimodal data is empty, + # directly invoke the tokenizer. + if "images" not in mm_data and "videos" not in mm_data and prompt != "": + tokenizer = self.info.get_tokenizer() + prompt_ids = tokenizer.encode(prompt) + tokenizer_output = BatchFeature(dict(input_ids=[prompt_ids]), + tensor_type="pt") + return tokenizer_output + + if "images" not in mm_data: + mm_data["images"] = [] + if "videos" not in mm_data: + mm_data["videos"] = [] + processor_output = self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), + dict(text=[prompt], + images=mm_data["images"], + videos=mm_data["videos"]), + dict(**mm_kwargs, **tok_kwargs), + ) + + # Divide the processor_output into two modalities: image and video. 
+ if processor_output is not None: + pixel_values = processor_output['images'] + if pixel_values is not None: + processor_output['images'] = self._pixel_values_norm( + pixel_values, mm_kwargs) + for key in list(processor_output.keys()): + if processor_output[key] is None: + del processor_output[key] + continue + if key == "grid_thw": + grid_thw = processor_output['grid_thw'] + pixel_values_all = processor_output['images'] + # Identify elements where the first + # dimension is greater than 1 and + # treat them as the video modality + mask = grid_thw[:, 0] > 1 + processor_output["video_grid_thw"] = grid_thw[mask] + processor_output["image_grid_thw"] = grid_thw[~mask] + image_patch_num = processor_output["image_grid_thw"].prod( + dim=1).sum() + processor_output[ + 'pixel_values'] = pixel_values_all[:image_patch_num] + processor_output['pixel_values_videos'] = pixel_values_all[ + image_patch_num:] + del processor_output['images'] + + return processor_output + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + before_placeholder = { + "image": "<|image@placeholder|>", + "video": "<|video@placeholder|>" + } + + after_placeholder = { + # image and video have same placeholder + "image": "<|IMAGE_PLACEHOLDER|>", + "video": "<|IMAGE_PLACEHOLDER|>" + } + + merge_length = hf_processor.spatial_conv_size**2 + + def get_replacement_ernie45vl(item_idx: int, modality: str): + out_item = out_mm_kwargs[modality][item_idx] + grid_thw = out_item[f"{modality}_grid_thw"].data + assert isinstance(grid_thw, torch.Tensor) + if modality == "video": + num_tokens = int(grid_thw.prod( + )) // hf_processor.temporal_conv_size // merge_length + else: + num_tokens = int(grid_thw.prod()) // merge_length + return after_placeholder[modality] * num_tokens + + return [ + PromptReplacement( + 
modality=modality, + target=before_placeholder[modality], + replacement=partial(get_replacement_ernie45vl, + modality=modality), + ) for modality in ("image", "video") + ] + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + + image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3))) + image_grid_sizes = image_grid_thw.prod(-1) + + video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3))) + video_grid_sizes = video_grid_thw.prod(-1) + + return dict( + pixel_values=MultiModalFieldConfig.flat_from_sizes( + "image", image_grid_sizes), + image_grid_thw=MultiModalFieldConfig.batched("image"), + pixel_values_videos=MultiModalFieldConfig.flat_from_sizes( + "video", video_grid_sizes), + video_grid_thw=MultiModalFieldConfig.batched("video"), + ) + + +class Ernie4_5_VLDummyInputsBuilder( + BaseDummyInputsBuilder[Ernie4_5_VLProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + prompt = "" + for i in range(num_images): + prompt += (f"Picture {i+1}:" + "<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>") + + for i in range(num_videos): + prompt += (f"Video {i+1}:" + "<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>") + return prompt + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + target_num_frames = \ + self.info.get_num_frames_with_most_features(seq_len, mm_counts) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "video": + self._get_dummy_videos(width=target_width, + height=target_height, + num_frames=target_num_frames, + 
num_videos=num_videos) + } + + +@MULTIMODAL_REGISTRY.register_processor( + Ernie4_5VLMultiModalProcessor, + info=Ernie4_5_VLProcessingInfo, + dummy_inputs=Ernie4_5_VLDummyInputsBuilder) +class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsLoRA, SupportsPP): + + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # To ensure correct weight loading and mapping. + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + # model.resampler_model.-> language_model.model.resampler_model. + # language_model.model.resampler_model. -> resampler_model. + "language_model.model.resampler_model.": "resampler_model.", + }, + # resampler_weight_mappings + orig_to_new_substr={ + "spatial_linear.0.": "spatial_linear1.", + "spatial_linear.2.": "spatial_linear2.", + "spatial_linear.3.": "spatial_norm.", + "temporal_linear.0.": "temporal_linear1.", + "temporal_linear.2.": "temporal_linear2.", + "temporal_linear.3.": "temporal_norm.", + }) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + if modality.startswith("image"): + return "<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>" + if modality.startswith("video"): + return "<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>" + + raise ValueError("Only image or video modality is supported") + + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + + self.vision_model = Ernie4_5_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=quant_config, + prefix=maybe_prefix(prefix, 
"vision_model"), + ) + + self.language_model = Ernie4_5_VLMoeForCausalLM( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model"), + ) + + self.resampler_model = VariableResolutionResamplerModel( + self.config.pixel_hidden_size, + self.config.hidden_size, + self.config.spatial_conv_size, + self.config.temporal_conv_size, + config=self.config, + prefix=maybe_prefix(prefix, "resampler_model")) + + self.visual_token_mask = None + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + """compute logits""" + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def _vision_forward( + self, + pixel_values: torch.Tensor, + grid_thw: torch.Tensor, + ) -> torch.Tensor: + if grid_thw is not None: + grid_thw = grid_thw[grid_thw > 0] + if grid_thw.numel() % 3 != 0: + raise ValueError( + f"grid_thw has {grid_thw.numel()} elements after filtering," + "which is not divisible by 3.") + grid_thw = grid_thw.reshape(-1, 3) + # example: [[1,64,64],[2,80,80]] -> [[1,64,64],[1,80,80],[1,80,80]] + grid_thw = F.pad( + torch.repeat_interleave(grid_thw[:, 1:], grid_thw[:, 0], 0), + [1, 0, 0, 0], + value=1, + ) + image_features = self.vision_model(pixel_values, grid_thw) + return image_features + + def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None: + if getattr(self.config, "im_patch_id", None) is not None: + self.visual_token_mask = ( + input_ids == self.config.im_patch_id).reshape(-1, 1) + else: + self.visual_token_mask = None + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def _validate_and_reshape_mm_tensor(self, mm_input: object, + name: str) -> torch.Tensor: + if not isinstance(mm_input, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of {name}. 
" + f"Got type: {type(mm_input)}") + if isinstance(mm_input, torch.Tensor): + if mm_input.ndim == 2: + return mm_input + if mm_input.ndim != 3: + raise ValueError(f"{name} should be 2D or batched 3D tensor. " + f"Got ndim: {mm_input.ndim} " + f"(shape={mm_input.shape})") + return torch.concat(list(mm_input)) + else: + return torch.concat(mm_input) + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Ernie4_5_VLImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_grid_thw = kwargs.pop("image_grid_thw", None) + + if pixel_values is None: + return None + + if pixel_values is not None: + pixel_values = self._validate_and_reshape_mm_tensor( + pixel_values, "image pixel values") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of image pixel values. " + f"Got type: {type(pixel_values)}") + + return Ernie4_5_VLImagePixelInputs(type="pixel_values", + pixel_values=pixel_values, + image_grid_thw=image_grid_thw) + + def _parse_and_validate_video_input( + self, **kwargs: object) -> Optional[Ernie4_5_VLVideoInputs]: + pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_grid_thw = kwargs.pop("video_grid_thw", None) + + if pixel_values_videos is None: + return None + + if pixel_values_videos is not None: + pixel_values_videos = self._validate_and_reshape_mm_tensor( + pixel_values_videos, "video pixel values") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + return Ernie4_5_VLVideoPixelInputs( + type="pixel_values_videos", + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + ) + + def _process_image_input( + self, + image_input: Ernie4_5_VLImageInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + + pixel_values = image_input["pixel_values"].type( + 
self.vision_model.dtype) + image_features = self._vision_forward(pixel_values=pixel_values, + grid_thw=grid_thw) + image_embeds = self.resampler_model(image_features, grid_thw) + + merge_size = self.vision_model.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + + return image_embeds.split(sizes.tolist()) + + def _process_video_input( + self, + video_input: Ernie4_5_VLVideoInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 + + pixel_values_videos = video_input["pixel_values_videos"].type( + self.vision_model.dtype) + video_features = self._vision_forward(pixel_values=pixel_values_videos, + grid_thw=grid_thw) + video_embeds = self.resampler_model(video_features, grid_thw) + + merge_size = self.vision_model.spatial_merge_size + sizes = (grid_thw.prod(-1) // + self.config.temporal_conv_size) // merge_size // merge_size + + return video_embeds.split(sizes.tolist()) + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. + for input_key in kwargs: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: + modalities["images"] = self._parse_and_validate_image_input( + **kwargs) + if input_key in ("pixel_values_videos", + "video_embeds") and "videos" not in modalities: + modalities["videos"] = self._parse_and_validate_video_input( + **kwargs) + + return modalities + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: + return None + + # The result multimodal_embeddings is tuple of tensors, with each + # tensor correspoending to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] 
= () + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. + for modality in modalities: + if modality == "images": + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings += vision_embeddings + if modality == "videos": + video_input = modalities["videos"] + video_embeddings = self._process_video_input(video_input) + multimodal_embeddings += video_embeddings + + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + if multimodal_embeddings is None: + return inputs_embeds + + self._set_visual_token_mask(input_ids) + inputs_embeds = merge_multimodal_embeddings(input_ids, inputs_embeds, + multimodal_embeddings, + [self.config.im_patch_id]) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs, + ): + + forward_kwargs = { + "input_ids": input_ids, + "positions": positions, + "intermediate_tensors": intermediate_tensors, + "inputs_embeds": inputs_embeds, + } + + if self.visual_token_mask is not None: + + if self.visual_token_mask.shape[0] != inputs_embeds.shape[0]: + padding_len = inputs_embeds.shape[ + 0] - self.visual_token_mask.shape[0] + # right pad False + pad = torch.zeros( + (padding_len, self.visual_token_mask.shape[1]), + dtype=self.visual_token_mask.dtype, + device=self.visual_token_mask.device) + self.visual_token_mask = torch.cat( + [self.visual_token_mask, pad], dim=0) + + forward_kwargs.update( + {"visual_token_mask": self.visual_token_mask}) + self.visual_token_mask = None + + hidden_states = self.language_model.model( + **forward_kwargs, + **kwargs, + ) + + return 
hidden_states + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py new file mode 100644 index 0000000000000..f56c098435154 --- /dev/null +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -0,0 +1,723 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The Baidu team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Erine VL model compatible with HuggingFace weights.""" +from collections.abc import Iterable +from typing import Any, Optional, Union + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.attention import Attention +# from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding.ernie45_vl_rope import ( + Ernie4_5_VLRotaryEmbedding) +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .ernie45_moe import Ernie4_5_MoeMLP +from .interfaces import SupportsPP +from .utils import (PPMissingLayer, extract_layer_index, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) + +logger = init_logger(__name__) + + +class Ernie4_5_VLMoeMLP(Ernie4_5_MoeMLP): + pass + + +class Ernie4_5_VLMoeAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + head_dim: Optional[int] = None, + rope_theta: float = 500000, + rope_scaling: Optional[dict[str, Any]] = None, + freq_allocation: int = 20, + max_position_embeddings: int = 131072, + rms_norm_eps: float = 1e-05, + 
qkv_bias: bool = False, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + layer_idx = extract_layer_index(prefix) if len(prefix) > 0 else 0 + self.layer_idx = layer_idx + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim or (hidden_size // self.total_num_heads) + + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear(hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj") + + self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj") + + t_rope = freq_allocation + h_rope = (self.head_dim // 2 - freq_allocation) // 2 + w_rope = (self.head_dim // 2 - freq_allocation) // 2 + + self.rotary_emb = Ernie4_5_VLRotaryEmbedding( + head_size=self.head_dim, + rotary_dim=self.head_dim, + max_position_embeddings=max_position_embeddings, + base=rope_theta, + is_neox_style=False, + dtype=torch.get_default_dtype(), 
+ mrope_section=[h_rope, w_rope, t_rope]) + + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn") + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + + qkv, _ = self.qkv_proj(hidden_states) + + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + + # Attention + attn_output = self.attn(q, k, v) + # Output projection + output, _ = self.o_proj(attn_output) + return output + + +class Ernie4_5_VLMoeMoE(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + + layer_idx = extract_layer_index(prefix) + self.layer_idx = layer_idx + self.tp_size = get_tensor_model_parallel_world_size() + self.has_shared_experts = (getattr(config, "moe_num_shared_experts", 0) + > 0) + self.hidden_size = config.hidden_size + + moe_num_experts = config.moe_num_experts + max_moe_num_experts = max(moe_num_experts) + + if self.tp_size > max_moe_num_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {moe_num_experts}.") + + moe_layer_start_index = config.moe_layer_start_index + text_moe_layer_start_index = moe_layer_start_index[0] + vision_moe_layer_start_index = moe_layer_start_index[1] + moe_layer_end_index = config.moe_layer_end_index + moe_layer_end_index = getattr( + config, "moe_layer_end_index", + [config.num_hidden_layers - 1, config.num_hidden_layers - 1]) + text_moe_layer_end_index = moe_layer_end_index[0] + vision_moe_layer_end_index = moe_layer_end_index[1] + + assert config.moe_num_experts[0] == config.moe_num_experts[1] + self.e_score_correction_bias = nn.Parameter( + torch.empty(2, config.moe_num_experts[0])) + + assert text_moe_layer_start_index <= 
text_moe_layer_end_index + + if layer_idx >= text_moe_layer_start_index and \ + layer_idx <= text_moe_layer_end_index: + self.text_experts_gate = ReplicatedLinear( + config.hidden_size, + config.moe_num_experts[0], + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.text_experts_gate") + + self.text_experts = FusedMoE( + num_experts=config.moe_num_experts[0], + top_k=config.moe_k, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size[0], + reduce_results=False, + renormalize=True, + quant_config=quant_config, + e_score_correction_bias=self.e_score_correction_bias[0], + prefix=f"{prefix}.text_experts") + else: + self.text_experts = Ernie4_5_VLMoeMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + use_bias=getattr(config, 'use_bias', False), + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + assert vision_moe_layer_start_index <= vision_moe_layer_end_index + if layer_idx >= vision_moe_layer_start_index and \ + layer_idx <= vision_moe_layer_end_index: + self.vision_experts_gate = ReplicatedLinear( + config.hidden_size, + config.moe_num_experts[1], + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.vision_experts_gate") + + self.vision_experts = FusedMoE( + num_experts=config.moe_num_experts[1], + top_k=config.moe_k, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size[1], + reduce_results=False, + renormalize=True, + quant_config=quant_config, + e_score_correction_bias=self.e_score_correction_bias[1], + prefix=f"{prefix}.vision_experts") + else: + self.vision_experts = Ernie4_5_VLMoeMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + use_bias=getattr(config, 'use_bias', False), + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + if self.has_shared_experts: + intermediate_size = (config.moe_intermediate_size[0] * + 
config.moe_num_shared_experts) + self.shared_experts = Ernie4_5_VLMoeMLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.shared_experts", + reduce_results=self.text_experts. + must_reduce_shared_expert_outputs()) + + def forward( + self, + hidden_states: torch.Tensor, + visual_token_mask: torch.Tensor, + **kwargs: object, + ) -> torch.Tensor: + + orig_shape = hidden_states.shape + hidden_dim = hidden_states.shape[-1] + hidden_states = hidden_states.view(-1, hidden_dim) + + if self.has_shared_experts: + shared_output = self.shared_experts(hidden_states) + + if visual_token_mask is not None and visual_token_mask.any(): + # assert visual_token_mask.shape[0] != hidden_states.shape[0] + visual_token_mask = visual_token_mask.repeat( + 1, self.hidden_size).bool() + text_token_mask = ~visual_token_mask + final_hidden_states = torch.zeros_like(hidden_states) + + text_hidden_states = hidden_states[text_token_mask].reshape( + -1, self.hidden_size) + vision_hidden_states = hidden_states[visual_token_mask].reshape( + -1, self.hidden_size) + + text_router_logits, _ = self.text_experts_gate(text_hidden_states) + final_hidden_states[text_token_mask] = self.text_experts( + hidden_states=text_hidden_states, + router_logits=text_router_logits).flatten() + + vision_router_logits, _ = self.vision_experts_gate( + vision_hidden_states) + final_hidden_states[visual_token_mask] = self.vision_experts( + hidden_states=vision_hidden_states, + router_logits=vision_router_logits).flatten() + else: + # text modal input processing directly + text_router_logits, _ = self.text_experts_gate(hidden_states) + + final_hidden_states = self.text_experts( + hidden_states=hidden_states, router_logits=text_router_logits) + + if self.has_shared_experts and \ + shared_output is not None: + final_hidden_states = final_hidden_states + shared_output + + if self.tp_size > 1: + final_hidden_states = ( + 
self.text_experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states)) + + return final_hidden_states.view(orig_shape) + + +class Ernie4_5_VLMoeDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 500000) + rope_scaling = getattr(config, "rope_scaling", None) + freq_allocation = getattr(config, "freq_allocation", 20) + max_position_embeddings = getattr(config, "max_position_embeddings", + 131072) + + self.self_attn = Ernie4_5_VLMoeAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + head_dim=getattr(config, 'head_dim', None), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + freq_allocation=freq_allocation, + max_position_embeddings=max_position_embeddings, + rms_norm_eps=config.rms_norm_eps, + qkv_bias=getattr(config, 'use_bias', False), + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + layer_idx = extract_layer_index(prefix) + self.layer_idx = layer_idx + + # MoE + moe_layer_start_index = config.moe_layer_start_index + min_moe_layer_start_index = min(moe_layer_start_index) + moe_layer_end_index = getattr( + config, "moe_layer_end_index", + [config.num_hidden_layers - 1, config.num_hidden_layers - 1]) + max_moe_layer_end_index = max(moe_layer_end_index) + assert min_moe_layer_start_index <= max_moe_layer_end_index + moe_num_experts = config.moe_num_experts + max_moe_num_experts = max(moe_num_experts) + moe_layer_interval = getattr(config, "moe_layer_interval", 1) + use_moe = getattr(config, "use_moe", max_moe_num_experts > 0) + + if (use_moe and ((layer_idx + 1) % moe_layer_interval == 0) + and layer_idx >= min_moe_layer_start_index + and layer_idx <= 
max_moe_layer_end_index): + self.mlp = Ernie4_5_VLMoeMoE(config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + else: + self.mlp = Ernie4_5_VLMoeMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + use_bias=getattr(config, 'use_bias', False), + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + visual_token_mask: Optional[torch.Tensor], + **kwargs: object, + ) -> torch.Tensor: + + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + + if isinstance(self.mlp, Ernie4_5_VLMoeMoE): + hidden_states = self.mlp(hidden_states, visual_token_mask, + **kwargs) + else: + hidden_states = self.mlp(hidden_states) + + return hidden_states, residual + + +# Since Ernie VL distinguishes between text experts and vision experts, +# enabling torch.compile will cause errors. 
+# @support_torch_compile( +# dynamic_arg_dims={ +# "input_ids": 0, +# "positions": -1, +# "intermediate_tensors": 0, +# "inputs_embeds": 0, +# "visual_token_mask": 0, +# }) +class Ernie4_5_VLMoeModel(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.config = config + + self.im_patch_id = config.im_patch_id + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens") + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: Ernie4_5_VLMoeDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + prefix=f"{prefix}.layers", + ) + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + visual_token_mask: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert 
intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, residual, + visual_token_mask, **kwargs) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +# only used as text backbone for ernie4.5-vl +class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = Ernie4_5_VLMoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + else: + self.lm_head = PPMissingLayer() + + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = 
self.model(input_ids, positions, intermediate_tensors, + inputs_embeds, **kwargs) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=max(self.config.moe_num_experts)) + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if self.config.tie_word_embeddings and name.endswith( + "lm_head.weight"): + loaded_params.add("lm_head.weight") + continue + # MTP will be supported soon. + if "mtp" in name or \ + "vision_model" in name or \ + "resampler_model" in name: + continue + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + + if (("mlp.experts." in name) and name not in params_dict): + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. 
+ if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Distinguish between vision experts and text experts + if "mlp.experts" in name: + moe_offset = int(name.split(".")[-3]) + vision_expert_start_idx = self.config.moe_num_experts[0] + is_text_expert = \ + moe_offset <= vision_expert_start_idx - 1 + if is_text_expert: + name = name.replace(".experts.", ".text_experts.") + else: + name = name.replace( + f".experts.{moe_offset}", + f".vision_experts.{moe_offset-vision_expert_start_idx}" + ) + + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + + if weight_name not in name: + continue + + # Distinguish between vision experts and text experts + moe_offset = int(name.split(".")[-3]) + is_text_expert = \ + moe_offset <= self.config.moe_num_experts[0] - 1 + + name = name.replace(weight_name, param_name) + if is_text_expert: + name = name.replace(".experts.", ".text_experts.") + else: + name = name.replace(".experts.", ".vision_experts.") + + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + + # Skip loading extra bias for GPTQ models. 
+ if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + param = params_dict[name] + + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id) + break + else: + # Distinguish between vision expert gate + # and text expert gate + if name.endswith("mlp.gate.weight"): + name = name.replace("gate.weight", + "text_experts_gate.weight") + loaded_weight = loaded_weight.T + elif name.endswith("mlp.gate.weight_1"): + name = name.replace("gate.weight_1", + "vision_experts_gate.weight") + loaded_weight = loaded_weight.T + + if "e_score_correction_bias" in name: + name = name.replace(".moe_statics.", ".") + + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + param = params_dict[name] + + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index ebf78771e40a4..c65c58d4a047f 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -206,6 +206,7 @@ _MULTIMODAL_MODELS = { "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 "Cohere2VisionForConditionalGeneration": ("cohere2_vision", "Cohere2VisionForConditionalGeneration"), # noqa: E501 "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"), + "Ernie4_5_VLMoeForConditionalGeneration": ("ernie45_vl", "Ernie4_5_VLMoeForConditionalGeneration"), # noqa: E501 "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), 
"Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501 "Gemma3nForConditionalGeneration": ("gemma3n_mm", "Gemma3nForConditionalGeneration"), # noqa: E501 From 32102644213a6367d10ec3a92ae76fb0004f3a52 Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Tue, 26 Aug 2025 21:58:59 -0700 Subject: [PATCH 067/112] [Frontend] Add --log-error-stack to print stack trace for error response (#22960) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- vllm/entrypoints/openai/api_server.py | 10 ++++++++++ vllm/entrypoints/openai/cli_args.py | 2 ++ vllm/entrypoints/openai/serving_chat.py | 4 +++- vllm/entrypoints/openai/serving_classification.py | 2 ++ vllm/entrypoints/openai/serving_completion.py | 2 ++ vllm/entrypoints/openai/serving_embedding.py | 4 +++- vllm/entrypoints/openai/serving_engine.py | 9 +++++++++ vllm/entrypoints/openai/serving_pooling.py | 4 +++- vllm/entrypoints/openai/serving_responses.py | 2 ++ vllm/entrypoints/openai/serving_score.py | 4 +++- vllm/entrypoints/openai/serving_tokenization.py | 4 +++- vllm/entrypoints/openai/serving_transcription.py | 8 ++++++-- vllm/entrypoints/openai/speech_to_text.py | 4 +++- 13 files changed, 51 insertions(+), 8 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index db02767fdfd71..9a2470649c8d2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1749,6 +1749,7 @@ async def init_app_state( enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, + log_error_stack=args.log_error_stack, ) if "generate" in supported_tasks else None state.openai_serving_chat = OpenAIServingChat( engine_client, @@ -1767,6 +1768,7 @@ async def init_app_state( enable_prompt_tokens_details=args.enable_prompt_tokens_details, 
enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, + log_error_stack=args.log_error_stack, ) if "generate" in supported_tasks else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, @@ -1776,6 +1778,7 @@ async def init_app_state( return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, + log_error_stack=args.log_error_stack, ) if "generate" in supported_tasks else None state.openai_serving_pooling = OpenAIServingPooling( engine_client, @@ -1784,6 +1787,7 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, + log_error_stack=args.log_error_stack, ) if "encode" in supported_tasks else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, @@ -1792,12 +1796,14 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, + log_error_stack=args.log_error_stack, ) if "embed" in supported_tasks else None state.openai_serving_classification = ServingClassification( engine_client, model_config, state.openai_serving_models, request_logger=request_logger, + log_error_stack=args.log_error_stack, ) if "classify" in supported_tasks else None enable_serving_reranking = ("classify" in supported_tasks and getattr( @@ -1807,6 +1813,7 @@ async def init_app_state( model_config, state.openai_serving_models, request_logger=request_logger, + log_error_stack=args.log_error_stack, ) if ("embed" in supported_tasks or enable_serving_reranking) else None state.openai_serving_tokenization = OpenAIServingTokenization( @@ -1816,18 +1823,21 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, 
chat_template_content_format=args.chat_template_content_format, + log_error_stack=args.log_error_stack, ) state.openai_serving_transcription = OpenAIServingTranscription( engine_client, model_config, state.openai_serving_models, request_logger=request_logger, + log_error_stack=args.log_error_stack, ) if "transcription" in supported_tasks else None state.openai_serving_translation = OpenAIServingTranslation( engine_client, model_config, state.openai_serving_models, request_logger=request_logger, + log_error_stack=args.log_error_stack, ) if "transcription" in supported_tasks else None state.enable_server_load_tracking = args.enable_server_load_tracking diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 6e4eff5c80243..d0b5d013eb9e5 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -180,6 +180,8 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT """Maximum number of HTTP headers allowed in a request for h11 parser. Helps mitigate header abuse. 
Default: 256.""" + log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE + """If set to True, log the stack trace of error responses""" @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7e0e627780970..1c0ffdfb91897 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -76,13 +76,15 @@ class OpenAIServingChat(OpenAIServing): enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, enable_log_outputs: bool = False, + log_error_stack: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, - enable_force_include_usage=enable_force_include_usage) + enable_force_include_usage=enable_force_include_usage, + log_error_stack=log_error_stack) self.response_role = response_role self.chat_template = chat_template diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py index 377f7f6847179..1d510d0b60a2d 100644 --- a/vllm/entrypoints/openai/serving_classification.py +++ b/vllm/entrypoints/openai/serving_classification.py @@ -129,12 +129,14 @@ class ServingClassification(ClassificationMixin): models: OpenAIServingModels, *, request_logger: Optional[RequestLogger], + log_error_stack: bool = False, ) -> None: super().__init__( engine_client=engine_client, model_config=model_config, models=models, request_logger=request_logger, + log_error_stack=log_error_stack, ) async def create_classify( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index a0ce654094039..b81fd63ece7a4 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -59,6 +59,7 @@ class 
OpenAIServingCompletion(OpenAIServing): return_tokens_as_token_ids: bool = False, enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, + log_error_stack: bool = False, ): super().__init__( engine_client=engine_client, @@ -67,6 +68,7 @@ class OpenAIServingCompletion(OpenAIServing): request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, enable_force_include_usage=enable_force_include_usage, + log_error_stack=log_error_stack, ) self.enable_prompt_tokens_details = enable_prompt_tokens_details self.default_sampling_params = ( diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 9dcad8e391c68..45c1932f1873c 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -593,11 +593,13 @@ class OpenAIServingEmbedding(EmbeddingMixin): request_logger: Optional[RequestLogger], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, + log_error_stack: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, models=models, - request_logger=request_logger) + request_logger=request_logger, + log_error_stack=log_error_stack) self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 0f4a7c0186b65..a97935e109ef2 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -5,6 +5,7 @@ import io import json import sys import time +import traceback from collections.abc import AsyncGenerator, Iterable, Mapping, Sequence from concurrent.futures import ThreadPoolExecutor from http import HTTPStatus @@ -205,6 +206,7 @@ class OpenAIServing: request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, enable_force_include_usage: 
bool = False, + log_error_stack: bool = False, ): super().__init__() @@ -222,6 +224,7 @@ class OpenAIServing: self._async_tokenizer_pool: dict[AnyTokenizer, AsyncMicrobatchTokenizer] = {} + self.log_error_stack = log_error_stack def _get_async_tokenizer(self, tokenizer) -> AsyncMicrobatchTokenizer: """ @@ -412,6 +415,12 @@ class OpenAIServing: message: str, err_type: str = "BadRequestError", status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse: + if self.log_error_stack: + exc_type, _, _ = sys.exc_info() + if exc_type is not None: + traceback.print_exc() + else: + traceback.print_stack() return ErrorResponse(error=ErrorInfo( message=message, type=err_type, code=status_code.value)) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 38745d001ade6..e8cb1aed84596 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -58,11 +58,13 @@ class OpenAIServingPooling(OpenAIServing): request_logger: Optional[RequestLogger], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, + log_error_stack: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, models=models, - request_logger=request_logger) + request_logger=request_logger, + log_error_stack=log_error_stack) self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 67eec2d523e3f..899cb07b2b37d 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -88,6 +88,7 @@ class OpenAIServingResponses(OpenAIServing): enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, enable_log_outputs: bool = False, + log_error_stack: bool = False, ) -> None: super().__init__( 
engine_client=engine_client, @@ -96,6 +97,7 @@ class OpenAIServingResponses(OpenAIServing): request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, enable_force_include_usage=enable_force_include_usage, + log_error_stack=log_error_stack, ) self.chat_template = chat_template diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index c246274514dbf..37838e22a4002 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -47,11 +47,13 @@ class ServingScores(OpenAIServing): models: OpenAIServingModels, *, request_logger: Optional[RequestLogger], + log_error_stack: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, models=models, - request_logger=request_logger) + request_logger=request_logger, + log_error_stack=log_error_stack) async def _embedding_score( self, diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 58d720474768b..2f258255d5f16 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -39,11 +39,13 @@ class OpenAIServingTokenization(OpenAIServing): request_logger: Optional[RequestLogger], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, + log_error_stack: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, models=models, - request_logger=request_logger) + request_logger=request_logger, + log_error_stack=log_error_stack) self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 0d6989fe91bfa..9ba58d4425221 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ 
b/vllm/entrypoints/openai/serving_transcription.py @@ -32,13 +32,15 @@ class OpenAIServingTranscription(OpenAISpeechToText): *, request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, + log_error_stack: bool = False, ): super().__init__(engine_client=engine_client, model_config=model_config, models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, - task_type="transcribe") + task_type="transcribe", + log_error_stack=log_error_stack) async def create_transcription( self, audio_data: bytes, request: TranscriptionRequest, @@ -88,13 +90,15 @@ class OpenAIServingTranslation(OpenAISpeechToText): *, request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, + log_error_stack: bool = False, ): super().__init__(engine_client=engine_client, model_config=model_config, models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, - task_type="translate") + task_type="translate", + log_error_stack=log_error_stack) async def create_translation( self, audio_data: bytes, request: TranslationRequest, diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index de2619a78f8e0..1cbd7dba393f6 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -53,12 +53,14 @@ class OpenAISpeechToText(OpenAIServing): request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, task_type: Literal["transcribe", "translate"] = "transcribe", + log_error_stack: bool = False, ): super().__init__(engine_client=engine_client, model_config=model_config, models=models, request_logger=request_logger, - return_tokens_as_token_ids=return_tokens_as_token_ids) + return_tokens_as_token_ids=return_tokens_as_token_ids, + log_error_stack=log_error_stack) self.default_sampling_params = ( self.model_config.get_diff_sampling_param()) From 
142ac0803045b3a3edcd7aa58fe079872903a30c Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Tue, 26 Aug 2025 21:59:14 -0700 Subject: [PATCH 068/112] [Frontend] Optimize beam search performance by limiting concurrency (#23599) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- benchmarks/benchmark_throughput.py | 1 - tests/conftest.py | 8 +- tests/samplers/test_beam_search.py | 53 +++++++++++ vllm/entrypoints/llm.py | 138 ++++++++++++++++------------- 4 files changed, 136 insertions(+), 64 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c7f290e1eb88e..6b24b8c8f3c67 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -96,7 +96,6 @@ def run_vllm( end = time.perf_counter() else: assert lora_requests is None, "BeamSearch API does not support LoRA" - prompts = [request.prompt for request in requests] # output_len should be the same for all requests. output_len = requests[0].expected_output_len for request in requests: diff --git a/tests/conftest.py b/tests/conftest.py index 2bf88abb0f6c2..f8bfdfc8e6259 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1022,15 +1022,17 @@ class VllmRunner: images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, + concurrency_limit: Optional[int] = None, ) -> list[tuple[list[list[int]], list[str]]]: inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) - outputs = self.llm.beam_search( - inputs, - BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) + outputs = self.llm.beam_search(inputs, + BeamSearchParams(beam_width=beam_width, + max_tokens=max_tokens), + concurrency_limit=concurrency_limit) returned_outputs = [] for output in outputs: token_ids = [x.tokens for x in output.sequences] diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index bdf48c7687b25..cc9a88a255f9f 
100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -67,6 +67,59 @@ def test_beam_search_single_input( f"vLLM: {vllm_output_ids}") +@pytest.mark.skip_v1 # FIXME: This fails on V1 right now. +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", MAX_TOKENS) +@pytest.mark.parametrize("beam_width", BEAM_WIDTHS) +def test_beam_search_with_concurrency_limit( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, + beam_width: int, +) -> None: + # example_prompts[1]&[3]&[7] fail due to unknown reason even without + # concurrency limit. skip them for now. + example_prompts = (example_prompts[:8]) + concurrency_limit = 2 + assert len(example_prompts) > concurrency_limit + with vllm_runner(model, dtype=dtype) as vllm_model: + outputs_with_limit = vllm_model.generate_beam_search( + example_prompts, + beam_width, + max_tokens, + concurrency_limit=concurrency_limit) + outputs_without_limit = [] + + for i in range(0, len(example_prompts), concurrency_limit): + outputs_without_limit.extend( + vllm_model.generate_beam_search( + example_prompts[i:i + concurrency_limit], beam_width, + max_tokens)) + + correct = True + for i in range(len(example_prompts)): + output_ids_with_limit, output_texts_with_limit = outputs_with_limit[i] + output_ids_without_limit, output_texts_without_limit = ( + outputs_without_limit[i]) + for j, (text_with_limit, text_without_limit) in enumerate( + zip(output_texts_with_limit, output_texts_without_limit)): + print(f">>>{j}-th with limit output:") + print(text_with_limit) + print(f">>>{j}-th without limit output:") + print(text_without_limit) + assert len(output_ids_with_limit) == len(output_ids_without_limit) + for j in range(len(output_ids_with_limit)): + if output_ids_with_limit[j] != output_ids_without_limit[j]: + print(f"Test{i} output{j}:\n+limit: {output_ids_with_limit}\n" + f"-limit: 
{output_ids_without_limit}") + correct = False + assert correct + + @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", MAX_TOKENS) @pytest.mark.parametrize("beam_width", MM_BEAM_WIDTHS) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 8816ff56d6840..72b6123670b70 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -523,6 +523,7 @@ class LLM: params: BeamSearchParams, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, use_tqdm: bool = False, + concurrency_limit: Optional[int] = None, ) -> list[BeamSearchOutput]: """ Generate sequences using beam search. @@ -533,6 +534,8 @@ class LLM: params: The beam search parameters. lora_request: LoRA request to use for generation, if any. use_tqdm: Whether to use tqdm to display the progress bar. + concurrency_limit: The maximum number of concurrent requests. + If None, the number of concurrent requests is unlimited. """ # TODO: how does beam search work together with length penalty, # frequency, penalty, and stopping criteria, etc.? @@ -551,6 +554,15 @@ class LLM: length_penalty, ) + if use_tqdm and concurrency_limit is not None: + logger.warning( + "Progress bar is not supported when using concurrency_limit. " + "Disabling progress bar.") + use_tqdm = False + + if concurrency_limit is None: + concurrency_limit = len(prompts) + def create_tokens_prompt_from_beam( beam: BeamSearchSequence) -> TokensPrompt: token_prompt_kwargs: TokensPrompt = { @@ -595,73 +607,79 @@ class LLM: **mm_kwargs, ), ) - token_iter = range(max_tokens) - if use_tqdm: - token_iter = tqdm(token_iter, - desc="Beam search", - unit="token", - unit_scale=False) - logger.warning( - "The progress bar shows the upper bound on token steps and " - "may finish early due to stopping conditions. 
It does not " - "reflect instance-level progress.") + for prompt_start in range(0, len(prompts), concurrency_limit): + instances_batch = instances[prompt_start:prompt_start + + concurrency_limit] - for _ in token_iter: - all_beams: list[BeamSearchSequence] = list( - sum((instance.beams for instance in instances), [])) - pos = [0] + list( - itertools.accumulate( - len(instance.beams) for instance in instances)) - instance_start_and_end: list[tuple[int, int]] = list( - zip(pos[:-1], pos[1:])) + token_iter = range(max_tokens) + if use_tqdm: + token_iter = tqdm(token_iter, + desc="Beam search", + unit="token", + unit_scale=False) + logger.warning( + "The progress bar shows the upper bound on token steps and " + "may finish early due to stopping conditions. It does not " + "reflect instance-level progress.") + for _ in token_iter: + all_beams: list[BeamSearchSequence] = list( + sum((instance.beams for instance in instances_batch), [])) + pos = [0] + list( + itertools.accumulate( + len(instance.beams) for instance in instances_batch)) + instance_start_and_end: list[tuple[int, int]] = list( + zip(pos[:-1], pos[1:])) - if len(all_beams) == 0: - break + if len(all_beams) == 0: + break - # create the corresponding batch entries for prompt & optional lora - prompts_batch, lora_req_batch = zip( - *[(create_tokens_prompt_from_beam(beam), beam.lora_request) - for beam in all_beams]) + # create corresponding batch entries for prompt & optional lora + prompts_batch, lora_req_batch = zip( + *[(create_tokens_prompt_from_beam(beam), beam.lora_request) + for beam in all_beams]) - # only runs for one step - # we don't need to use tqdm here - output = self.generate(prompts_batch, - sampling_params=beam_search_params, - use_tqdm=False, - lora_request=lora_req_batch) + # only runs for one step + # we don't need to use tqdm here + output = self.generate(prompts_batch, + sampling_params=beam_search_params, + use_tqdm=False, + lora_request=lora_req_batch) - for (start, end), instance in 
zip(instance_start_and_end, - instances): - instance_new_beams = [] - for i in range(start, end): - current_beam = all_beams[i] - result = output[i] + for (start, end), instance in zip(instance_start_and_end, + instances_batch): + instance_new_beams = [] + for i in range(start, end): + current_beam = all_beams[i] + result = output[i] - if result.outputs[0].logprobs is not None: - # if `result.outputs[0].logprobs` is None, it means - # the sequence is completed because of the max-model-len - # or abortion. we don't need to add it to the new beams. - logprobs = result.outputs[0].logprobs[0] - for token_id, logprob_obj in logprobs.items(): - new_beam = BeamSearchSequence( - tokens=current_beam.tokens + [token_id], - logprobs=current_beam.logprobs + [logprobs], - lora_request=current_beam.lora_request, - cum_logprob=current_beam.cum_logprob + - logprob_obj.logprob, - multi_modal_data=current_beam.multi_modal_data, - mm_processor_kwargs=current_beam. - mm_processor_kwargs) + if result.outputs[0].logprobs is not None: + # if `result.outputs[0].logprobs` is None, it means + # the sequence is completed because of the + # max-model-len or abortion. we don't need to add + # it to the new beams. + logprobs = result.outputs[0].logprobs[0] + for token_id, logprob_obj in logprobs.items(): + new_beam = BeamSearchSequence( + tokens=current_beam.tokens + [token_id], + logprobs=current_beam.logprobs + + [logprobs], + lora_request=current_beam.lora_request, + cum_logprob=current_beam.cum_logprob + + logprob_obj.logprob, + multi_modal_data=current_beam. + multi_modal_data, + mm_processor_kwargs=current_beam. 
+ mm_processor_kwargs) - if token_id == tokenizer.eos_token_id and \ - not ignore_eos: - instance.completed.append(new_beam) - else: - instance_new_beams.append(new_beam) - sorted_beams = sorted(instance_new_beams, - key=sort_beams_key, - reverse=True) - instance.beams = sorted_beams[:beam_width] + if token_id == tokenizer.eos_token_id and \ + not ignore_eos: + instance.completed.append(new_beam) + else: + instance_new_beams.append(new_beam) + sorted_beams = sorted(instance_new_beams, + key=sort_beams_key, + reverse=True) + instance.beams = sorted_beams[:beam_width] outputs = [] for instance in instances: From d272415e57c95da63c798c22c7d87cc5c0cda21f Mon Sep 17 00:00:00 2001 From: Dipika Sikka <dipikasikka1@gmail.com> Date: Wed, 27 Aug 2025 01:00:21 -0400 Subject: [PATCH 069/112] [Quantization] Expand compressed-tensors MoE matching logic to support NFP4 + FP8 MoEs (#22674) Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com> Signed-off-by: Dipika <dipikasikka1@gmail.com> --- .../compressed_tensors/compressed_tensors.py | 13 +++---- .../compressed_tensors_moe.py | 36 +++++++++++++++++-- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index ce74375aab426..245cf122ebab1 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -425,6 +425,10 @@ class CompressedTensorsConfig(QuantizationConfig): weight_quant: BaseModel, input_quant: BaseModel, format: Optional[str] = None) -> "CompressedTensorsScheme": + + # use the per-layer format if defined, otherwise, use global format + format = format if format is not None else self.quant_format + # Detect If Mixed Precision if self._is_fp4a16_nvfp4(weight_quant, input_quant): return CompressedTensorsW4A16Fp4() @@ -437,14 +441,14 
@@ class CompressedTensorsConfig(QuantizationConfig): actorder=weight_quant.actorder) if self._is_wNa16_group_channel(weight_quant, input_quant): - if (self.quant_format == CompressionFormat.marlin_24.value + if (format == CompressionFormat.marlin_24.value and weight_quant.num_bits in W4A16SPARSE24_SUPPORTED_BITS): assert weight_quant.symmetric return CompressedTensorsW4A16Sparse24( strategy=weight_quant.strategy, num_bits=weight_quant.num_bits, group_size=weight_quant.group_size) - if (self.quant_format == CompressionFormat.pack_quantized.value + if (format == CompressionFormat.pack_quantized.value and weight_quant.num_bits in WNA16_SUPPORTED_BITS): return CompressedTensorsWNA16( num_bits=weight_quant.num_bits, @@ -453,10 +457,7 @@ class CompressedTensorsConfig(QuantizationConfig): group_size=weight_quant.group_size, actorder=weight_quant.actorder) - act_quant_format = is_activation_quantization_format( - format - ) if format is not None else is_activation_quantization_format( - self.quant_format) + act_quant_format = is_activation_quantization_format(format) if act_quant_format: if self._is_fp4a4_nvfp4(weight_quant, input_quant): if cutlass_fp4_supported( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 1ee3478aa4f43..6279bb8b60570 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -22,6 +22,8 @@ from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( is_valid_flashinfer_cutlass_fused_moe) from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import ( # noqa WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + find_matched_target) from 
vllm.model_executor.layers.quantization.utils import replace_parameter from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( build_flashinfer_fp4_cutlass_moe_prepare_finalize, reorder_w1w3_to_w3w1, @@ -65,12 +67,40 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): @staticmethod def get_moe_method( quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 - layer: torch.nn.Module, + layer: torch.nn.Module ) -> "CompressedTensorsMoEMethod": # TODO: @dsikka: refactor this to use schemes as other kernels # are supported + check if the layer is being ignored. - weight_quant = quant_config.target_scheme_map["Linear"].get("weights") - input_quant = quant_config.target_scheme_map["Linear"].get( + # Check if using "Linear" to select schemes + if "Linear" in quant_config.target_scheme_map: + matched_target = "Linear" + else: + # May have instead defined the linear layers in the fused model + + fused_layers = [ + "re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*" + ] + current_scheme = None + for fused_layer in fused_layers: + # Check if one of the fused layers is defined in quant_config + matched_target = find_matched_target( + layer_name=fused_layer, + module=layer, + targets=quant_config.target_scheme_map.keys(), + fused_mapping=quant_config.packed_modules_mapping) + + # Only valid if down_proj, gate_proj, and up_proj + # are mapped to the same quant scheme in the quant_config + if current_scheme is None: + current_scheme = quant_config.target_scheme_map.get( + matched_target) + else: + assert current_scheme == quant_config.target_scheme_map.get( + matched_target) + + weight_quant = quant_config.target_scheme_map[matched_target].get( + "weights") + input_quant = quant_config.target_scheme_map[matched_target].get( "input_activations") if quant_config._is_wNa16_group_channel(weight_quant, input_quant): From fce10dbed5441b4f918b23a2b63aae72bc00a2f6 Mon Sep 17 00:00:00 2001 From: Kunshang Ji <kunshang.ji@intel.com> Date: Wed,
27 Aug 2025 13:33:27 +0800 Subject: [PATCH 070/112] [XPU] Add xpu torch.compile support (#22609) Signed-off-by: Kunshang Ji <kunshang.ji@intel.com> --- .buildkite/scripts/hardware_ci/run-xpu-test.sh | 1 + vllm/attention/layer.py | 3 +-- vllm/compilation/fix_functionalization.py | 8 ++++++++ vllm/platforms/cpu.py | 4 ++++ vllm/platforms/cuda.py | 4 ++++ vllm/platforms/interface.py | 8 ++++++++ vllm/platforms/rocm.py | 4 ++++ vllm/platforms/xpu.py | 15 ++++++--------- 8 files changed, 36 insertions(+), 11 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 445cd2735c190..73f3e63fbf5f6 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -31,6 +31,7 @@ docker run \ set -e echo $ZE_AFFINITY_MASK VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager + VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp cd tests diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 2d288bcbe0c95..237802afccde9 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -190,8 +190,7 @@ class Attention(nn.Module, AttentionLayerBase): # torch.compile works by registering the attention as one giant # opaque custom op. For other platforms, we directly call them # and let torch.compile handle them. 
- self.use_direct_call = not current_platform.is_cuda_alike( - ) and not current_platform.is_cpu() + self.use_direct_call = not current_platform.opaque_attention_op() self.use_output = self.attn_backend.accept_output_buffer compilation_config = get_current_vllm_config().compilation_config diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index 286221d32c1ee..60ae143318790 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -9,6 +9,7 @@ import torch from torch._higher_order_ops.auto_functionalize import auto_functionalized from vllm.logger import init_logger +from vllm.platforms import current_platform from .fx_utils import is_func from .vllm_inductor_pass import VllmInductorPass @@ -26,6 +27,13 @@ class FixFunctionalizationPass(VllmInductorPass): """ def __call__(self, graph: torch.fx.Graph): + # XPU does not support auto-functionalization yet. + # Will enable this when switch to vllm-xpu-kernels. 
+ if current_platform.is_xpu(): + logger.debug("XPU platform does not support fix functionalization " + "pass currently.") + return + self.begin() self.dump_graph(graph, "before_fix_functionalization") diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index c748595a71534..5686fae5cd7d1 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -335,3 +335,7 @@ class CpuPlatform(Platform): return (cls.supports_v1(model_config) and arch in (CpuArchEnum.X86, CpuArchEnum.POWERPC, CpuArchEnum.ARM, CpuArchEnum.S390X)) + + @classmethod + def opaque_attention_op(cls) -> bool: + return True diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index c0e0fe35e4024..5cbb7346436ef 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -442,6 +442,10 @@ class CudaPlatformBase(Platform): def use_custom_allreduce(cls) -> bool: return True + @classmethod + def opaque_attention_op(cls) -> bool: + return True + @classmethod def get_static_graph_wrapper_cls(cls) -> str: return "vllm.compilation.cuda_graph.CUDAGraphWrapper" diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index f6c17de86d05a..01f3e2d977bc3 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -509,6 +509,14 @@ class Platform: """ return False + + @classmethod + def opaque_attention_op(cls) -> bool: + """ + Returns True if we register attention as one giant opaque custom op + on the current platform + """ + return False + @classmethod def validate_request( cls, diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 85b2fe2e480c8..c6d14aa87c7f2 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -411,6 +411,10 @@ class RocmPlatform(Platform): supported_archs = ['gfx94', 'gfx95'] return any(gfx in gcn_arch for gfx in supported_archs) + @classmethod + def opaque_attention_op(cls) -> bool: + return True + @classmethod def get_cu_count(cls, device_id: int = 0) -> int: return torch.cuda.get_device_properties( diff --git
a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 235e5d8294e52..84f4cd7256465 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -90,21 +90,14 @@ class XPUPlatform(Platform): if cache_config and cache_config.block_size is None: cache_config.block_size = 64 - # FIXME: Temporarily forcing eager mode - # remove after t.compile support stabilizes. - if (envs.VLLM_USE_V1 and model_config is not None - and not vllm_config.model_config.enforce_eager): - from vllm.config import CompilationLevel - vllm_config.compilation_config.level = CompilationLevel.NO_COMPILATION # noqa: E501 - # lazy import to avoid circular import from vllm.config import CUDAGraphMode compilation_config = vllm_config.compilation_config if compilation_config.cudagraph_mode is None or \ compilation_config.cudagraph_mode.max_cudagraph_mode() \ != CUDAGraphMode.NONE: - logger.info("[XPU] CUDA graph is not supported on XPU, " - "disabling cudagraphs.") + logger.info("[XPU] CUDA graph is not supported on XPU, disabling " + "cudagraphs. Fallback to cudagraph_mode=NONE") compilation_config.cudagraph_mode = CUDAGraphMode.NONE # check and update parallel config @@ -182,3 +175,7 @@ class XPUPlatform(Platform): "Intel Arc A770 have bfloat16 accuracy known issue. 
" "You can use float16 instead by explicitly setting the " "`dtype` flag in CLI, for example: --dtype=half.") + + @classmethod + def opaque_attention_op(cls) -> bool: + return True From 9de25c294b92e42a12d1fbbb3ab3f633fa80291c Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Wed, 27 Aug 2025 13:51:50 +0800 Subject: [PATCH 071/112] [CI/Build] Remove redundant LoRA model tests (#23706) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- tests/lora/conftest.py | 5 -- tests/lora/test_baichuan.py | 112 ------------------------------------ tests/lora/test_phi.py | 71 ----------------------- 3 files changed, 188 deletions(-) delete mode 100644 tests/lora/test_baichuan.py delete mode 100644 tests/lora/test_phi.py diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index cba573b63c045..3475993ff8f07 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -216,11 +216,6 @@ def tinyllama_lora_files(): return snapshot_download(repo_id="jashing/tinyllama-colorist-lora") -@pytest.fixture(scope="session") -def phi2_lora_files(): - return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora") - - @pytest.fixture def reset_default_device(): """ diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py deleted file mode 100644 index 774ebb9db2106..0000000000000 --- a/tests/lora/test_baichuan.py +++ /dev/null @@ -1,112 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -import vllm -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.lora.request import LoRARequest - -MODEL_PATH = "baichuan-inc/Baichuan-7B" - -PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, 
concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: - prompts = [ - PROMPT_TEMPLATE.format(query="How many singers do we have?"), - PROMPT_TEMPLATE.format( - query= - "What is the average, minimum, and maximum age of all singers from France?" # noqa: E501 - ), - PROMPT_TEMPLATE.format( - query= - "Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501 - ), - ] - print(prompts) - sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) - # Print the outputs. 
- generated_texts: list[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text.strip() - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -def test_baichuan_lora(baichuan_lora_files): - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=4, - max_lora_rank=64, - trust_remote_code=True) - - expected_lora_output = [ - "SELECT count(*) FROM singer", - "SELECT avg(age) , min(age) , max(age) FROM singer WHERE Country = 'France'", # noqa: E501 - "SELECT name , country , age FROM singer ORDER BY age ASC", - ] - - output1 = do_sample(llm, baichuan_lora_files, lora_id=1) - for i in range(len(expected_lora_output)): - assert output1[i] == expected_lora_output[i] - output2 = do_sample(llm, baichuan_lora_files, lora_id=2) - for i in range(len(expected_lora_output)): - assert output2[i] == expected_lora_output[i] - - -@pytest.mark.parametrize("fully_sharded", [True, False]) -def test_baichuan_tensor_parallel_equality(baichuan_lora_files, - num_gpus_available, fully_sharded): - if num_gpus_available < 4: - pytest.skip(f"Not enough GPUs for tensor parallelism {4}") - - llm_tp1 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - max_lora_rank=64, - trust_remote_code=True, - fully_sharded_loras=fully_sharded) - output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1) - - del llm_tp1 - cleanup_dist_env_and_memory() - - llm_tp2 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - max_lora_rank=64, - tensor_parallel_size=2, - trust_remote_code=True, - fully_sharded_loras=fully_sharded) - output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2) - - del llm_tp2 - cleanup_dist_env_and_memory() - - assert output_tp1 == output_tp2 - - llm_tp4 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - max_lora_rank=64, - tensor_parallel_size=4, - 
trust_remote_code=True, - fully_sharded_loras=fully_sharded) - output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2) - - del llm_tp4 - cleanup_dist_env_and_memory() - - assert output_tp1 == output_tp4 diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py deleted file mode 100644 index 3090941e63679..0000000000000 --- a/tests/lora/test_phi.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import vllm -from vllm.lora.request import LoRARequest - -MODEL_PATH = "microsoft/phi-2" - -PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: - prompts = [ - PROMPT_TEMPLATE.format( - sql_prompt= - "Which catalog publisher has published the most catalogs?", - context="CREATE TABLE catalogs (catalog_publisher VARCHAR);"), - PROMPT_TEMPLATE.format( - sql_prompt= - "Which trip started from the station with the largest dock count? Give me the trip id.", # noqa: E501 - context= - "CREATE TABLE trip (id VARCHAR, start_station_id VARCHAR); CREATE TABLE station (id VARCHAR, dock_count VARCHAR);" # noqa: E501 - ), - PROMPT_TEMPLATE.format( - sql_prompt= - "How many marine species are found in the Southern Ocean?", # noqa: E501 - context= - "CREATE TABLE marine_species (name VARCHAR(50), common_name VARCHAR(50), location VARCHAR(50));" # noqa: E501 - ), - ] - sampling_params = vllm.SamplingParams(temperature=0, - max_tokens=64, - stop="### End") - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None, - ) - # Print the outputs. 
- generated_texts: list[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text.strip() - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -def test_phi2_lora(phi2_lora_files): - # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI, - # Otherwise, the lora-test will fail due to CUDA OOM. - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=2, - enforce_eager=True, - enable_chunked_prefill=True) - - expected_lora_output = [ - "SELECT catalog_publisher, COUNT(*) as num_catalogs FROM catalogs GROUP BY catalog_publisher ORDER BY num_catalogs DESC LIMIT 1;", # noqa: E501 - "SELECT trip.id FROM trip JOIN station ON trip.start_station_id = station.id WHERE station.dock_count = (SELECT MAX(dock_count) FROM station);", # noqa: E501 - "SELECT COUNT(*) FROM marine_species WHERE location = 'Southern Ocean';", # noqa: E501 - ] - - output1 = do_sample(llm, phi2_lora_files, lora_id=1) - for i in range(len(expected_lora_output)): - assert output1[i].startswith(expected_lora_output[i]) - output2 = do_sample(llm, phi2_lora_files, lora_id=2) - for i in range(len(expected_lora_output)): - assert output2[i].startswith(expected_lora_output[i]) From 8dbf6ed7be3f8602257ce1879825d4b5e3554d67 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" <rongfu.leng@daocloud.io> Date: Wed, 27 Aug 2025 13:54:39 +0800 Subject: [PATCH 072/112] [Bugfix] fix when config.yaml config value is list parse error (#23528) Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io> --- tests/utils_/test_utils.py | 41 ++++++++++++++++++++++++++++++++++++++ vllm/utils/__init__.py | 9 +++++++-- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 084d82dee11b3..04195ea0cf92e 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -5,13 +5,17 @@ import asyncio 
import hashlib import json +import os import pickle import socket +import tempfile from collections.abc import AsyncIterator +from pathlib import Path from unittest.mock import patch import pytest import torch +import yaml import zmq from transformers import AutoTokenizer from vllm_test_utils.monitor import monitor @@ -991,3 +995,40 @@ def test_current_stream_multithread(): child_thread.join(timeout=5) if child_thread.is_alive(): pytest.fail("Child thread failed to exit properly") + + +def test_load_config_file(tmp_path): + # Define the configuration data + config_data = { + "enable-logging": True, + "list-arg": ["item1", "item2"], + "port": 12323, + "tensor-parallel-size": 4 + } + + # Write the configuration data to a temporary YAML file + config_file_path = tmp_path / "config.yaml" + with open(config_file_path, "w") as config_file: + yaml.dump(config_data, config_file) + + # Initialize the parser + parser = FlexibleArgumentParser() + + # Call the function with the temporary file path + processed_args = parser.load_config_file(str(config_file_path)) + + # Expected output + expected_args = [ + "--enable-logging", + "--list-arg", + "item1", + "item2", + "--port", + "12323", + "--tensor-parallel-size", + "4", + ] + + # Assert that the processed arguments match the expected output + assert processed_args == expected_args + os.remove(str(config_file_path)) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 7c34a858c0a21..60bddc5b500b5 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1974,7 +1974,7 @@ class FlexibleArgumentParser(ArgumentParser): file_path = args[index + 1] - config_args = self._load_config_file(file_path) + config_args = self.load_config_file(file_path) # 0th index is for {serve,chat,complete} # optionally followed by model_tag (only for serve) @@ -2005,7 +2005,7 @@ class FlexibleArgumentParser(ArgumentParser): return args - def _load_config_file(self, file_path: str) -> list[str]: + def load_config_file(self, 
file_path: str) -> list[str]: """Loads a yaml file and returns the key value pairs as a flattened list with argparse like pattern ```yaml @@ -2046,6 +2046,11 @@ class FlexibleArgumentParser(ArgumentParser): if isinstance(value, bool) and key not in store_boolean_arguments: if value: processed_args.append('--' + key) + elif isinstance(value, list): + if value: + processed_args.append('--' + key) + for item in value: + processed_args.append(str(item)) else: processed_args.append('--' + key) processed_args.append(str(value)) From 69244e67e6822f1c15816f887659e1ccc18c2632 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Wed, 27 Aug 2025 14:19:13 +0800 Subject: [PATCH 073/112] [Core] Use key-only cache for `BaseMultiModalProcessor` (#23018) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- docs/configuration/conserving_memory.md | 2 +- docs/configuration/optimization.md | 44 +- .../multimodal/processing/test_common.py | 8 +- tests/multimodal/test_cache.py | 182 +++++++- vllm/config/__init__.py | 26 +- vllm/engine/arg_utils.py | 14 +- vllm/engine/llm_engine.py | 15 +- vllm/inputs/preprocess.py | 22 +- vllm/inputs/registry.py | 12 +- .../models/hyperclovax_vision.py | 7 +- vllm/model_executor/models/llava.py | 8 +- vllm/model_executor/models/minicpmv.py | 40 +- vllm/model_executor/models/mistral3.py | 8 +- vllm/model_executor/models/phi3v.py | 20 +- vllm/model_executor/models/phi4mm.py | 21 +- vllm/model_executor/models/tarsier.py | 7 +- vllm/multimodal/cache.py | 405 +++++++++++++++++- vllm/multimodal/inputs.py | 38 +- vllm/multimodal/processing.py | 187 ++++---- vllm/multimodal/profiling.py | 4 +- vllm/multimodal/registry.py | 90 ++-- vllm/v1/engine/async_llm.py | 3 +- vllm/v1/engine/core.py | 17 +- vllm/v1/engine/llm_engine.py | 3 +- vllm/v1/engine/mm_input_cache.py | 121 ------ vllm/v1/engine/processor.py | 29 +- vllm/v1/worker/gpu_model_runner.py | 3 + vllm/v1/worker/tpu_model_runner.py | 3 + vllm/v1/worker/utils.py | 9 +- 29 files 
changed, 954 insertions(+), 394 deletions(-) delete mode 100644 vllm/v1/engine/mm_input_cache.py diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 058eba5fe0b1e..efda9c8e019eb 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -86,7 +86,7 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", If you run out of CPU RAM, try the following options: -- (Multi-modal models only) you can set the size of multi-modal processor cache by setting `mm_processor_cache_gb` engine argument (default 4 GiB per API process + 4 GiB per engine core process) +- (Multi-modal models only) you can set the size of multi-modal cache by setting `mm_processor_cache_gb` engine argument (default 4 GiB). - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). ## Multi-modal input limits diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index bb47e1b90f086..3eaf2185a559e 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -204,20 +204,33 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2 to avoid CPU resource exhaustion. !!! note - [Multi-modal processor cache](#processor-cache) is disabled when API server scale-out is enabled - because it requires a one-to-one correspondence between API and engine core processes. + API server scale-out disables [multi-modal IPC caching](#ipc-caching) + because it requires a one-to-one correspondence between API and engine core processes. + + This does not impact [multi-modal processor caching](#processor-caching).
## Multi-Modal Caching -### Processor Cache - -By default, the multi-modal processor cache is enabled to avoid repeatedly processing -the same multi-modal inputs via Hugging Face `AutoProcessor`, +Multi-modal caching avoids repeated transfer or processing of the same multi-modal data, which commonly occurs in multi-turn conversations. -You can adjust the size of the cache by setting the value of `mm_processor_cache_gb` -(default 4 GiB per API process + 4 GiB per engine core process). -If you do not benefit much from the cache, you can disable it completely via `mm_processor_cache_gb=0`. +### Processor Caching + +Multi-modal processor caching is automatically enabled +to avoid repeatedly processing the same multi-modal inputs in `BaseMultiModalProcessor`. + +### IPC Caching + +Multi-modal IPC caching is automatically enabled when +there is a one-to-one correspondence between API (`P0`) and engine core (`P1`) processes, +to avoid repeatedly transferring the same multi-modal inputs between them. + +### Configuration + +You can adjust the size of the cache by setting the value of `mm_processor_cache_gb` (default 4 GiB). + +If you do not benefit much from the cache, you can disable both IPC +and processor caching completely via `mm_processor_cache_gb=0`. Examples: @@ -230,3 +243,16 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", mm_processor_cache_gb=0) ``` + +### Cache Placement + +Based on the configuration, the content of the multi-modal caches on `P0` and `P1` are as follows: + +| Processor Caching | IPC Caching | `P0` Cache | `P1` Cache | Max.
Memory | +|-------------------|-------------|------------|------------|-------------| +| ✅ | ✅ | K | K + V | `mm_processor_cache_gb * data_parallel_size` | +| ✅ | ❌ | K + V | N/A | `mm_processor_cache_gb * api_server_count` | +| ❌ | ❌ | N/A | N/A | `0` | + +K: Stores the hashes of multi-modal items +V: Stores the processed tensor data of multi-modal items diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 6361cb9b5586a..3ff4360b83345 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -14,8 +14,9 @@ from PIL import Image from vllm.config import ModelConfig from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict +from vllm.multimodal.cache import MultiModalProcessorOnlyCache from vllm.multimodal.inputs import MultiModalInputs -from vllm.multimodal.processing import BaseMultiModalProcessor, ProcessingCache +from vllm.multimodal.processing import BaseMultiModalProcessor from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, cached_tokenizer_from_config, encode_tokens) @@ -63,6 +64,8 @@ def _test_processing_correctness( revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=model_info.hf_overrides, + # Ensure that the cache can fit all of the data + mm_processor_cache_gb=2048, ) model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) @@ -71,8 +74,7 @@ def _test_processing_correctness( model_config, tokenizer=cached_tokenizer_from_config(model_config), ) - # Ensure that it can fit all of the data - cache = ProcessingCache(capacity_gb=2048) + cache = MultiModalProcessorOnlyCache(model_config) processing_info = factories.info(ctx) supported_mm_limits = processing_info.get_supported_mm_limits() diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index 088cd00db2e04..44c05db2278f7 100644 
--- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -1,32 +1,64 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import numpy as np import pytest import torch -from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata +from vllm.config import ModelConfig, ParallelConfig, VllmConfig +from vllm.multimodal.cache import (MultiModalCache, + MultiModalProcessorCacheItem, + MultiModalProcessorCacheItemMetadata, + processor_cache_from_config, + receiver_cache_from_config) +from vllm.multimodal.hasher import MultiModalHasher from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem, MultiModalKwargsItems, MultiModalSharedField) +from vllm.multimodal.processing import PromptInsertion +from vllm.multimodal.registry import MultiModalRegistry -def _dummy_elem(modality: str, key: str, size: int): +def _dummy_elem( + modality: str, + key: str, + size: int, + *, + rng: Optional[np.random.RandomState] = None, +): + if rng is None: + data = torch.empty((size, ), dtype=torch.int8) + else: + data = torch.from_numpy(rng.randint(4, size=(size, ), dtype=np.int8)) + return MultiModalFieldElem( modality=modality, key=key, - data=torch.empty((size, ), dtype=torch.int8), + data=data, field=MultiModalSharedField(1), ) -def _dummy_item(modality: str, size_by_key: dict[str, int]): +def _dummy_item( + modality: str, + size_by_key: dict[str, int], + *, + rng: Optional[np.random.RandomState] = None, +): return MultiModalKwargsItem.from_elems([ - _dummy_elem(modality, key, size) for key, size in size_by_key.items() + _dummy_elem(modality, key, size, rng=rng) + for key, size in size_by_key.items() ]) -def _dummy_items(size_by_key_modality: dict[str, dict[str, int]]): +def _dummy_items( + size_by_key_modality: dict[str, dict[str, int]], + *, + rng: Optional[np.random.RandomState] = None, +): return MultiModalKwargsItems.from_seq([ - 
_dummy_item(modality, size_by_key) + _dummy_item(modality, size_by_key, rng=rng) for modality, size_by_key in size_by_key_modality.items() ]) @@ -48,5 +80,139 @@ def test_cache_item_size(item, expected_size): cache[""] = item assert cache.currsize == expected_size - cache[""] = MultiModalCacheItemMetadata.wraps(item) + prompt_update = PromptInsertion("dummy", "target", "insertion") \ + .resolve(0) + + cache[""] = MultiModalProcessorCacheItem(item, [prompt_update]) assert cache.currsize == expected_size + + cache[""] = MultiModalProcessorCacheItemMetadata(item, [prompt_update]) + assert cache.currsize == expected_size + + +def _create_vllm_config( + *, + mm_processor_cache_gb: float, + enable_ipc: bool, +): + return VllmConfig( + model_config=ModelConfig(mm_processor_cache_gb=mm_processor_cache_gb), + parallel_config=ParallelConfig( + data_parallel_size=1 if enable_ipc else 2), + ) + + +def _compare_caches( + config_0: VllmConfig, + config_1: VllmConfig, + *, + item_capacity: int = 8, + hit_rate: float = 0.5, + max_items_per_iter: int = 3, + is_cached_calls_per_iter: int, + n_iter: int = 100, + seed: int = 0, +): + mm_registry = MultiModalRegistry() + cache_0_p0 = processor_cache_from_config(config_0, mm_registry) + cache_0_p1 = receiver_cache_from_config(config_0, mm_registry) + cache_1_p0 = processor_cache_from_config(config_1, mm_registry) + cache_1_p1 = receiver_cache_from_config(config_1, mm_registry) + + cache_size_gb = max( + config_0.model_config.mm_processor_cache_gb, + config_1.model_config.mm_processor_cache_gb, + ) + item_size_gb = int(cache_size_gb / item_capacity) + + rng = np.random.RandomState(seed) + all_items = [ + _dummy_item("item", {"key": item_size_gb}, rng=rng) + for _ in range(int(item_capacity / hit_rate)) + ] + all_hashes = [ + MultiModalHasher.hash_kwargs(item=item.get_data()) + for item in all_items + ] + + # Should not be used since there is nothing to convert to text + prompt_update = PromptInsertion("dummy", "target", "insertion") + + 
for it in range(n_iter): + num_items_to_select = rng.randint(0, max_items_per_iter) + item_idxs_to_select = rng.choice(len(all_items), num_items_to_select) + + selected_items = [all_items[idx] for idx in item_idxs_to_select] + selected_hashes = [all_hashes[idx] for idx in item_idxs_to_select] + + if cache_0_p0 is None: + cache_0_p0_out = selected_items + else: + for _ in range(is_cached_calls_per_iter): + cache_0_p0.is_cached(selected_hashes) + cache_0_p0_out = [ + item for item, _ in cache_0_p0.get_and_update( + [(item, prompt_update.content) for item in selected_items], + selected_hashes, + ) + ] + + if cache_1_p0 is None: + cache_1_p0_out = selected_items + else: + for _ in range(is_cached_calls_per_iter): + cache_1_p0.is_cached(selected_hashes) + cache_1_p0_out = [ + item for item, _ in cache_1_p0.get_and_update( + [(item, prompt_update.content) for item in selected_items], + selected_hashes, + ) + ] + + if cache_0_p1 is None: + cache_0_p1_out = cache_0_p0_out + else: + cache_0_p1_out = cache_0_p1.get_and_update(cache_0_p0_out, + selected_hashes) + + if cache_1_p1 is None: + cache_1_p1_out = cache_1_p0_out + else: + cache_1_p1_out = cache_1_p1.get_and_update(cache_1_p0_out, + selected_hashes) + + assert cache_0_p1_out == cache_1_p1_out, f"Failed at {it=}" + + +@pytest.mark.parametrize("is_cached_calls_per_iter", [1, 2, 3]) +def test_ipc_enable_disable_consistency(is_cached_calls_per_iter): + cache_size_gb = 1 / (1 << 20) + + vllm_config_ipc_enabled = _create_vllm_config( + mm_processor_cache_gb=cache_size_gb, + enable_ipc=True, + ) + vllm_config_ipc_disabled = _create_vllm_config( + mm_processor_cache_gb=0, + enable_ipc=False, + ) + vllm_config_cache_disabled = _create_vllm_config( + mm_processor_cache_gb=cache_size_gb, + enable_ipc=True, + ) + + _compare_caches( + vllm_config_ipc_enabled, + vllm_config_ipc_disabled, + is_cached_calls_per_iter=is_cached_calls_per_iter, + ) + _compare_caches( + vllm_config_ipc_disabled, + vllm_config_cache_disabled, + 
is_cached_calls_per_iter=is_cached_calls_per_iter, + ) + _compare_caches( + vllm_config_cache_disabled, + vllm_config_ipc_enabled, + is_cached_calls_per_iter=is_cached_calls_per_iter, + ) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index cd0e17977edec..ac6f51df95498 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -437,7 +437,7 @@ class ModelConfig: from `AutoProcessor.from_pretrained`. The available overrides depend on the model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`. """ - mm_processor_cache_gb: int = 4 + mm_processor_cache_gb: float = 4 """The size (in GiB) of the multi-modal processor cache, which is used to avoid re-processing past multi-modal inputs. @@ -884,12 +884,6 @@ class ModelConfig: return None - def set_mm_processor_cache_gb(self, value: int) -> None: - mm_config = self.get_multimodal_config() - - self.mm_processor_cache_gb = value - mm_config.mm_processor_cache_gb = value - def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( self.model, self.revision) @@ -1697,22 +1691,6 @@ class ModelConfig: def is_multimodal_model(self) -> bool: return self.multimodal_config is not None - @property - def enable_mm_processor_cache(self) -> bool: - """Whether the multi-modal processor cache should be enabled.""" - mm_config = self.multimodal_config - if mm_config is None: - return False - - return mm_config.mm_processor_cache_gb > 0 - - def get_mm_input_cache_gb(self) -> int: - mm_config = self.multimodal_config - if mm_config is None: - return 0 - - return envs.VLLM_MM_INPUT_CACHE_GIB - @property def is_cross_encoder(self) -> bool: return (self._model_info.supports_cross_encoding @@ -2561,7 +2539,7 @@ class MultiModalConfig: `{"num_crops": 4}`. 
""" - mm_processor_cache_gb: int = 4 + mm_processor_cache_gb: float = 4 """ The size (in GiB) of the multi-modal processor cache, which is used to diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f24c50ad73261..9e7c95ea5205f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -351,7 +351,7 @@ class EngineArgs: mm_processor_kwargs: Optional[Dict[str, Any]] = \ MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED - mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb + mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling # LoRA fields @@ -1293,18 +1293,6 @@ class EngineArgs: worker_extension_cls=self.worker_extension_cls, ) - if model_config.is_multimodal_model: - dp_supports_mm_processor_cache = (self.data_parallel_size == 1 - or data_parallel_external_lb) - if (not dp_supports_mm_processor_cache - and model_config.mm_processor_cache_gb > 0): - logger.warning( - "Multi-modal processor cache is disabled because " - "it is not compatible with data parallelism when " - "there does not exist a one-to-one correspondance " - "between API and engine core processes.") - model_config.set_mm_processor_cache_gb(0) - speculative_config = self.create_speculative_config( target_model_config=model_config, target_parallel_config=parallel_config, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index cbd714c159eb5..03c2f0375da42 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -36,6 +36,7 @@ from vllm.logits_process import get_bad_words_logits_processors from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal.cache import processor_only_cache_from_config from 
vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.outputs import (PoolingRequestOutput, RequestOutput, RequestOutputFactory) @@ -250,9 +251,13 @@ class LLMEngine: self.generation_config_fields = ( self.model_config.try_get_generation_config()) - self.input_preprocessor = InputPreprocessor(self.model_config, - self.tokenizer, - mm_registry) + self.input_preprocessor = InputPreprocessor( + self.model_config, + self.tokenizer, + mm_registry, + mm_processor_cache=processor_only_cache_from_config( + self.model_config, mm_registry), + ) self.model_executor = executor_class(vllm_config=vllm_config) @@ -840,8 +845,8 @@ class LLMEngine: def reset_mm_cache(self) -> bool: """Reset the multi-modal cache.""" - return self.input_preprocessor.mm_registry.reset_processor_cache( - self.model_config) + self.input_preprocessor.clear_cache() + return True def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: """Reset prefix cache for all devices.""" diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 3f521012e82a2..f0d0cab3df3d9 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -11,6 +11,7 @@ from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalInputs) from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -32,12 +33,14 @@ class InputPreprocessor: model_config: ModelConfig, tokenizer: Optional[TokenizerGroup], mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + mm_processor_cache: Optional[BaseMultiModalProcessorCache] = None, ) -> None: super().__init__() self.model_config = model_config self.tokenizer = tokenizer self.mm_registry = mm_registry + self.mm_processor_cache = mm_processor_cache def 
get_tokenizer_group(self) -> TokenizerGroup: if self.tokenizer is None: @@ -261,8 +264,11 @@ class InputPreprocessor: """ tokenizer = self._get_mm_tokenizer(lora_request) - mm_processor = self.mm_registry.create_processor(self.model_config, - tokenizer=tokenizer) + mm_processor = self.mm_registry.create_processor( + self.model_config, + tokenizer=tokenizer, + cache=self.mm_processor_cache, + ) if mm_processor_kwargs is None: mm_processor_kwargs = {} @@ -286,8 +292,12 @@ class InputPreprocessor: """ tokenizer = await self._get_mm_tokenizer_async(lora_request) - mm_processor = self.mm_registry.create_processor(self.model_config, - tokenizer=tokenizer) + mm_processor = self.mm_registry.create_processor( + self.model_config, + tokenizer=tokenizer, + cache=self.mm_processor_cache, + ) + if mm_processor_kwargs is None: mm_processor_kwargs = {} @@ -860,3 +870,7 @@ class InputPreprocessor: tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) + + def clear_cache(self) -> None: + if self.mm_processor_cache is not None: + self.mm_processor_cache.clear_cache() diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index ef146fdfbf97c..f0b392e9767ae 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -223,20 +223,26 @@ class InputRegistry: The model is identified by ``model_config``. 
""" # Avoid circular import + from vllm.multimodal.cache import processor_only_cache_from_config from vllm.sequence import SequenceData if not model_config.is_multimodal_model: seq_data = SequenceData.from_prompt_token_counts((0, seq_len)) return DummyData(seq_data=seq_data) + cache = processor_only_cache_from_config(model_config, mm_registry) + # Encoder dummy data does not contain multi-modal data if is_encoder_data: - enc_data = mm_registry.get_encoder_dummy_data( - model_config, seq_len) + enc_data = mm_registry.get_encoder_dummy_data(model_config, + seq_len, + cache=cache) seq_data = SequenceData.from_seqs(enc_data.prompt_token_ids) return DummyData(seq_data=seq_data) - dec_data = mm_registry.get_decoder_dummy_data(model_config, seq_len) + dec_data = mm_registry.get_decoder_dummy_data(model_config, + seq_len, + cache=cache) return DummyData( seq_data=SequenceData.from_seqs(dec_data.prompt_token_ids), diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index eeb8291c77847..53f0585541b1c 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -33,12 +33,13 @@ from vllm.inputs import InputProcessingContext from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, ProcessingCache, - PromptReplacement, PromptUpdate) + BaseProcessingInfo, PromptReplacement, + PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors @@ -367,7 +368,7 @@ def 
_build_hcxvision_hf_processor( info: HCXVisionProcessingInfo, dummy_inputs: BaseDummyInputsBuilder[HCXVisionProcessingInfo], *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor: if isinstance(info, HCXVisionProcessingInfo): return HCXVisionMultiModalProcessor( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index bc53982c938ce..0ee26b68345c3 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -22,14 +22,14 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargsItems) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, ProcessingCache, - PromptReplacement, PromptUpdate, - PromptUpdateDetails) + BaseProcessingInfo, PromptReplacement, + PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils.jsontree import json_map_leaves @@ -394,7 +394,7 @@ def _build_llava_or_pixtral_hf_processor( info: _I, dummy_inputs: BaseDummyInputsBuilder[_I], *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor: if isinstance(info, PixtralHFProcessingInfo): return PixtralHFMultiModalProcessor( diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index a2a71bdd12b36..c22d871ab20d9 100644 --- 
a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -58,7 +58,8 @@ from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem, VideoItem, VideoProcessorItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, - PromptUpdate, PromptUpdateDetails) + PromptUpdate, PromptUpdateDetails, + ResolvedPromptUpdate, _seq2text) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -744,6 +745,43 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): for modality, pattern in placeholders ] + def _recompute_cached_prompt_update( + self, + cached_update: ResolvedPromptUpdate, + new_item_idx: int, + ) -> ResolvedPromptUpdate: + new_update = super()._recompute_cached_prompt_update( + cached_update, + new_item_idx, + ) + + if cached_update.modality == "image": + tokenizer = self.info.get_tokenizer() + image_processor = self.info.get_image_processor() + version = self.info.get_model_version() + + text = _seq2text(tokenizer, cached_update.content.full) + prev_item_idx = cached_update.item_idx + + if version == (2, 0) or version == (2, 5): + im_start = image_processor.im_start_token + im_end = image_processor.im_end_token + else: + im_start = image_processor.im_id_start + im_end = image_processor.im_id_end + + new_update = new_update.with_content( + PromptUpdateDetails.select_text( + text.replace( + f"{im_start}{prev_item_idx}{im_end}", + f"{im_start}{new_item_idx}{im_end}", + 1, + ), + "<unk>", + )) + + return new_update + def _get_mm_fields_config( self, hf_inputs: BatchFeature, diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 438513433d3b2..08948960b275c 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -22,14 +22,14 @@ from vllm.model_executor.layers.quantization import 
QuantizationConfig from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, ProcessingCache, - PromptReplacement, PromptUpdate, - PromptUpdateDetails) + BaseProcessingInfo, PromptReplacement, + PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils.tensor_schema import TensorSchema, TensorShape @@ -322,7 +322,7 @@ def _build_mistral3_processor( info: _I, dummy_inputs: BaseDummyInputsBuilder[_I], *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor: assert isinstance(info, Mistral3ProcessingInfo) return Mistral3MultiModalProcessor( diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 61e09d56046cc..4522c7043d01a 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -41,7 +41,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, MultiModalPromptUpdates, PlaceholderFeaturesInfo, - PromptReplacement, PromptUpdate) + PromptReplacement, PromptUpdate, + ResolvedPromptUpdate) # yapf: enable from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors @@ -440,6 +441,23 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]): ) ] + def _recompute_cached_prompt_update( + self, + cached_update: ResolvedPromptUpdate, + new_item_idx: int, + ) -> 
ResolvedPromptUpdate: + new_update = super()._recompute_cached_prompt_update( + cached_update, + new_item_idx, + ) + + if cached_update.modality == "image": + hf_processor = self.info.get_hf_processor() + image_tokens: list[str] = hf_processor.img_tokens # type: ignore + new_update = new_update.with_target(image_tokens[new_item_idx]) + + return new_update + def _apply_prompt_updates( self, token_ids: list[int], diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 5129770e8d499..211cbd9c819cc 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -27,7 +27,7 @@ from vllm.multimodal.parse import (AudioProcessorItems, ImageEmbeddingItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, - PromptUpdate) + PromptUpdate, ResolvedPromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -850,6 +850,25 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): ), ] + def _recompute_cached_prompt_update( + self, + cached_update: ResolvedPromptUpdate, + new_item_idx: int, + ) -> ResolvedPromptUpdate: + new_update = super()._recompute_cached_prompt_update( + cached_update, + new_item_idx, + ) + + if cached_update.modality == "image": + image_tokens: list[str] = self.info.image_tokens # type: ignore + new_update = new_update.with_target(image_tokens[new_item_idx]) + elif cached_update.modality == "audio": + audio_tokens: list[str] = self.info.audio_tokens # type: ignore + new_update = new_update.with_target(audio_tokens[new_item_idx]) + + return new_update + @MULTIMODAL_REGISTRY.register_processor( Phi4MMMultiModalProcessor, diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index 9b9cca8c6bd3c..c66867315e553 100644 --- 
a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -25,12 +25,13 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.llava import LlavaDummyInputsBuilder from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, ProcessingCache, - PromptReplacement, PromptUpdate) + BaseProcessingInfo, PromptReplacement, + PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils.jsontree import json_map_leaves @@ -332,7 +333,7 @@ def _build_tarsier_hf_processor( info: _I_Tarsier, dummy_inputs: BaseDummyInputsBuilder[_I_Tarsier], *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor: if isinstance(info, TarsierProcessingInfo): return TarsierMultiModalProcessor( diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 5cec8e71fb265..0e81cb6d4d190 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import sys -from collections.abc import Mapping -from dataclasses import dataclass -from typing import TypeVar, Union +from abc import ABC, abstractmethod +from collections.abc import Mapping, Sequence +from typing import TYPE_CHECKING, Generic, Optional, TypeVar, Union import torch +from typing_extensions import TypeAlias, override from vllm.logger import init_logger from vllm.utils import GiB_bytes, 
LRUCache @@ -15,24 +16,67 @@ from .inputs import (MultiModalFieldElem, MultiModalKwargs, MultiModalKwargsItem, MultiModalKwargsItems, NestedTensors) +if TYPE_CHECKING: + from vllm.config import ModelConfig, VllmConfig + + from .processing import ResolvedPromptUpdate + from .registry import MultiModalRegistry + logger = init_logger(__name__) -@dataclass -class MultiModalCacheItemMetadata: - size: int +class MultiModalProcessorCacheItem: + """ + The data to store inside `MultiModalProcessorOnlyCache`. - @classmethod - def wraps(cls, value: "MultiModalCacheValue"): - return cls(size=MultiModalCache.get_item_size(value)) + Args: + item: The processed tensor data corresponding to a multi-modal item. + prompt_updates: The prompt updates corresponding to `item`. + """ + + def __init__( + self, + item: MultiModalKwargsItem, + prompt_updates: Sequence["ResolvedPromptUpdate"], + ) -> None: + super().__init__() + + self.item = item + self.prompt_updates = prompt_updates + + +class MultiModalProcessorCacheItemMetadata: + """ + The metadata to store inside `MultiModalProcessorSenderCache`. + + Args: + item: The processed tensor data corresponding to a multi-modal item. + Since P1 already stores the tensor data, we only store its size + metadata in P0 to reduce memory usage. The size metadata is still + needed to keep the same cache eviction policy as P1. + prompt_updates: The prompt updates corresponding to `item`. + This needs to stay on P0 because for some models, they are + dependent on the processed tensor data (cached on P1). 
+ """ + + def __init__( + self, + item: MultiModalKwargsItem, + prompt_updates: Sequence["ResolvedPromptUpdate"], + ) -> None: + super().__init__() + + self.item_size = MultiModalCache.get_item_size(item) + self.prompt_updates = prompt_updates MultiModalCacheValue = Union[ + MultiModalProcessorCacheItem, + MultiModalProcessorCacheItemMetadata, MultiModalKwargsItems, MultiModalKwargsItem, MultiModalKwargs, Mapping[str, NestedTensors], - MultiModalCacheItemMetadata, ] _V = TypeVar("_V", bound=MultiModalCacheValue) @@ -47,8 +91,10 @@ class MultiModalCache: *, debug: bool = False, ) -> int: - if isinstance(leaf, MultiModalFieldElem): - return cls.get_item_size(leaf.data) # type: ignore + if isinstance(leaf, MultiModalProcessorCacheItem): + return cls.get_leaf_size(leaf.item) + if isinstance(leaf, MultiModalProcessorCacheItemMetadata): + return leaf.item_size # These are not subclasses of dict if isinstance(leaf, MultiModalKwargsItems): @@ -58,13 +104,13 @@ class MultiModalCache: if isinstance(leaf, MultiModalKwargs): return cls.get_item_size(leaf.data) # type: ignore + if isinstance(leaf, MultiModalFieldElem): + return cls.get_item_size(leaf.data) # type: ignore + # sys.getsizeof doesn't work for tensors if isinstance(leaf, torch.Tensor): return leaf.nbytes - if isinstance(leaf, MultiModalCacheItemMetadata): - return leaf.size - return sys.getsizeof(leaf) @classmethod @@ -98,3 +144,332 @@ class MultiModalCache: GiB_bytes * capacity_gb, getsizeof=lambda x: cls.get_item_size(x, debug=debug), ) + + +_I = TypeVar("_I", contravariant=True) +_O = TypeVar("_O", covariant=True) + + +class BaseMultiModalCache(ABC, Generic[_I, _O]): + """ + Abstract base class to read/write multi-modal items from cache. + + The idea of multi-modal caching is based on having a client and server + where the client executes in the frontend process (=P0) and + the server in the core process (=P1). 
The data flow is as follows: + + ``` + is_cached() x N get_and_update() + P0: From API -----------------> -----------------> To P1 + + get_and_update() + P1: From P0 -----------------> To model + ``` + + `is_cached()` can be called any number of times in P0. However, + `get_and_update()` must be called in P0 and P1 one after another + so that their cache eviction order remains the same. + + This ensures that the keys in P0 and P1 caches are mirrored, + allowing us to determine whether a key is cached in P1 by looking + up the P0 cache, without having to communicate with P1. + """ + + @abstractmethod + def get_and_update_item( + self, + mm_item: _I, + mm_hash: str, + ) -> _O: + """ + Possibly update a multi-modal item based on whether it is + in the underlying cache. + + This update is done out-of-place and updates the cache eviction order. + + Args: + mm_item: The multi-modal item to update. + mm_hash: The hash of `mm_item`. + + Returns: + The updated multi-modal item. + """ + raise NotImplementedError + + def get_and_update( + self, + mm_items: Sequence[_I], + mm_hashes: list[str], + ) -> list[_O]: + """ + Possibly update a sequence of multi-modal items based on whether they + are in the underlying cache. + + This update is done out-of-place and updates the cache eviction order. + + Args: + mm_items: The multi-modal items to update. + mm_hashes: The hash of each item in `mm_items`. + + Returns: + A new list of updated multi-modal items. 
+ """ + assert len(mm_items) == len(mm_hashes) + + return [ + self.get_and_update_item(mm_item, mm_hash) + for mm_item, mm_hash in zip(mm_items, mm_hashes) + ] + + @abstractmethod + def clear_cache(self) -> None: + """Clear the underlying cache.""" + raise NotImplementedError + + +MultiModalProcessorCacheInItem: TypeAlias = \ + Optional[tuple[MultiModalKwargsItem, Sequence["ResolvedPromptUpdate"]]] + + +MultiModalProcessorCacheOutItem: TypeAlias = \ + tuple[Optional[MultiModalKwargsItem], Sequence["ResolvedPromptUpdate"]] + + +class BaseMultiModalProcessorCache( + BaseMultiModalCache[MultiModalProcessorCacheInItem, + MultiModalProcessorCacheOutItem]): + """The required interface for caches on P0.""" + + @abstractmethod + def is_cached_item(self, mm_hash: str) -> bool: + """ + Check whether a multi-modal item is + in the underlying cache. + + This **DOES NOT** update the cache eviction order. + + Args: + mm_hash: The hash of the item to check. + + Returns: + `True` if the item is cached, otherwise `False`. + """ + raise NotImplementedError + + def is_cached(self, mm_hashes: list[str]) -> list[bool]: + """ + Check whether a sequence of multi-modal items are + in the underlying cache. + + This **DOES NOT** update the cache eviction order. + + Args: + mm_hashes: The hash of each item to check. + + Returns: + For each item, `True` if the item is cached, otherwise `False`. + """ + return [self.is_cached_item(mm_hash) for mm_hash in mm_hashes] + + +class MultiModalProcessorOnlyCache(BaseMultiModalProcessorCache): + """ + The cache which is used on P0 when IPC caching is disabled. + + How to update each item: + + - If the item is in the cache, replace the input with the cached item. + - If the item is not in the cache, store that item (which includes + tensor data and metadata) into the cache, and return the input. 
+ """ + + def __init__(self, model_config: "ModelConfig") -> None: + super().__init__() + + mm_config = model_config.get_multimodal_config() + + self._cache = MultiModalCache.get_lru_cache( + mm_config.mm_processor_cache_gb, + MultiModalProcessorCacheItem, + ) + + @override + def is_cached_item(self, mm_hash: str) -> bool: + return mm_hash in self._cache + + @override + def get_and_update_item( + self, + mm_item: MultiModalProcessorCacheInItem, + mm_hash: str, + ) -> MultiModalProcessorCacheOutItem: + if (cached_item := self._cache.get(mm_hash)) is not None: + return cached_item.item, cached_item.prompt_updates + + assert mm_item is not None, f"Expected a cached item for {mm_hash=}" + + self._cache[mm_hash] = MultiModalProcessorCacheItem(*mm_item) + + return mm_item + + @override + def clear_cache(self) -> None: + self._cache.clear() + + +class MultiModalProcessorSenderCache(BaseMultiModalProcessorCache): + """ + The cache which is used on P0 when IPC caching is enabled. + + How to update each item: + + - If the item is already in the cache, clear the input to avoid + unnecessary IPC. + + - If the item is not in the cache, store the metadata of that item so + that the eviction policy remains the same as the cache on P1, + and return the input. + By only storing the metadata, we avoid keeping the data itself in + memory inside P0. 
+ """ + + def __init__(self, model_config: "ModelConfig") -> None: + super().__init__() + + mm_config = model_config.get_multimodal_config() + + self._cache = MultiModalCache.get_lru_cache( + mm_config.mm_processor_cache_gb, + MultiModalProcessorCacheItemMetadata, + ) + + @override + def is_cached_item(self, mm_hash: str) -> bool: + return mm_hash in self._cache + + @override + def get_and_update_item( + self, + mm_item: MultiModalProcessorCacheInItem, + mm_hash: str, + ) -> MultiModalProcessorCacheOutItem: + if (cached_item := self._cache.get(mm_hash)) is not None: + return None, cached_item.prompt_updates + + assert mm_item is not None, f"Expected a cached item for {mm_hash=}" + + self._cache[mm_hash] = MultiModalProcessorCacheItemMetadata(*mm_item) + + return mm_item + + @override + def clear_cache(self) -> None: + self._cache.clear() + + +def _enable_processor_cache( + model_config: "ModelConfig", + mm_registry: "MultiModalRegistry", +) -> bool: + if not mm_registry.supports_multimodal_inputs(model_config): + return False + + mm_config = model_config.get_multimodal_config() + return mm_config.mm_processor_cache_gb > 0 + + +def _enable_ipc_cache(vllm_config: "VllmConfig") -> bool: + parallel_config = vllm_config.parallel_config + supports_ipc_cache = (parallel_config.data_parallel_size == 1 + or parallel_config.data_parallel_external_lb) + + return supports_ipc_cache + + +def processor_cache_from_config( + vllm_config: "VllmConfig", + mm_registry: "MultiModalRegistry", +) -> Optional[BaseMultiModalProcessorCache]: + """Return a `BaseMultiModalProcessorCache`, if enabled.""" + model_config = vllm_config.model_config + + if not _enable_processor_cache(model_config, mm_registry): + return None + + if not _enable_ipc_cache(vllm_config): + return MultiModalProcessorOnlyCache(model_config) + + return MultiModalProcessorSenderCache(model_config) + + +def processor_only_cache_from_config( + model_config: "ModelConfig", + mm_registry: "MultiModalRegistry", +): + 
"""Return a `MultiModalProcessorOnlyCache`, if enabled.""" + if not _enable_processor_cache(model_config, mm_registry): + return None + + return MultiModalProcessorOnlyCache(model_config) + + +class BaseMultiModalReceiverCache( + BaseMultiModalCache[Optional[MultiModalKwargsItem], + MultiModalKwargsItem]): + """The required interface for caches on P1.""" + + +class MultiModalReceiverCache(BaseMultiModalReceiverCache): + """ + The cache which is used on P1 when IPC caching is enabled. + + How to update each item: + + - If the item is in the cache, replace the input with the cached item. + - If the item is not in the cache, store that item (which includes tensor + data) into the cache, and return the input. + """ + + def __init__(self, model_config: "ModelConfig") -> None: + super().__init__() + + mm_config = model_config.get_multimodal_config() + + self._cache = MultiModalCache.get_lru_cache( + mm_config.mm_processor_cache_gb, + MultiModalKwargsItem, + ) + + @override + def get_and_update_item( + self, + mm_item: Optional[MultiModalKwargsItem], + mm_hash: str, + ) -> MultiModalKwargsItem: + if (cached_item := self._cache.get(mm_hash)) is not None: + return cached_item + + assert mm_item is not None, f"Expected a cached item for {mm_hash=}" + + self._cache[mm_hash] = mm_item + return mm_item + + @override + def clear_cache(self) -> None: + self._cache.clear() + + +def receiver_cache_from_config( + vllm_config: "VllmConfig", + mm_registry: "MultiModalRegistry", +) -> Optional[BaseMultiModalReceiverCache]: + """Return a `BaseMultiModalReceiverCache`, if enabled.""" + model_config = vllm_config.model_config + + if not _enable_processor_cache(model_config, mm_registry): + return None + + if not _enable_ipc_cache(vllm_config): + return None + + return MultiModalReceiverCache(model_config) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 581f9a109cce6..2c0ebaced67ef 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -7,11 
+7,11 @@ from collections.abc import Mapping, Sequence from dataclasses import dataclass from functools import partial from itertools import accumulate -from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar, - Union, cast, final) +from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, Union, + cast, final) import numpy as np -from typing_extensions import NotRequired, TypeAlias, deprecated +from typing_extensions import NotRequired, TypeAlias, TypeVar, deprecated from vllm.utils import LazyLoader, full_groupby, is_list_of from vllm.utils.jsontree import JSONTree, json_map_leaves @@ -668,7 +668,15 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): return {key: elem.data for key, elem in self.items()} -class MultiModalKwargsItems(UserDict[str, Sequence[MultiModalKwargsItem]]): +_I = TypeVar( + "_I", + MultiModalKwargsItem, + Optional[MultiModalKwargsItem], + default=MultiModalKwargsItem, +) + + +class MultiModalKwargsItems(UserDict[str, Sequence[_I]]): """ A dictionary of [`MultiModalKwargsItem`][vllm.multimodal.inputs.MultiModalKwargsItem]s @@ -714,27 +722,37 @@ class MultiModalKwargsItems(UserDict[str, Sequence[MultiModalKwargsItem]]): items_by_modality = full_groupby(items, key=lambda x: x.modality) return MultiModalKwargsItems(items_by_modality) - def __getitem__(self, modality: str): + def __getitem__(self, modality: str) -> Sequence[_I]: if modality not in self: raise KeyError(f"Modality {modality!r} not found. 
" f"Available modalities: {set(self.keys())}") - return super().__getitem__(modality) + return super().__getitem__(modality) # type: ignore[return-value] def get_data(self, *, pin_memory: bool = False) -> "MultiModalKwargs": elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) - for items in self.values(): - for item in items: + for modality, items in self.items(): + for i, item in enumerate(items): + if item is None: + raise RuntimeError("Cannot build data from empty " + f"mm_items[{modality}][{i}]") + for key, elem in item.items(): elems_by_key[key].append(elem) return MultiModalKwargs({ key: elems[0].field.reduce_data(elems, pin_memory=pin_memory) - for key, elems in elems_by_key.items() if len(elems) > 0 + for key, elems in elems_by_key.items() }) +MultiModalKwargsOptionalItems: TypeAlias = Union[ + MultiModalKwargsItems[MultiModalKwargsItem], + MultiModalKwargsItems[Optional[MultiModalKwargsItem]], +] + + class MultiModalKwargs(UserDict[str, NestedTensors]): """ A dictionary that represents the keyword arguments to @@ -898,7 +916,7 @@ class MultiModalInputs(TypedDict): token_type_ids: NotRequired[list[int]] """The token type IDs of the prompt.""" - mm_kwargs: MultiModalKwargsItems + mm_kwargs: MultiModalKwargsOptionalItems """Keyword arguments to be directly passed to the model after batching.""" mm_hashes: "MultiModalHashDict" diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 8c225e2a3c086..6ecdf80d4aa6f 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from collections import defaultdict from collections.abc import (Callable, Generator, ItemsView, Iterable, Mapping, Sequence) -from dataclasses import dataclass, field +from dataclasses import dataclass, field, replace from enum import Enum from functools import lru_cache from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol, @@ -20,12 +20,11 @@ from 
vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens, encode_tokens) from vllm.utils import flatten_2d_lists, full_groupby -from .cache import MultiModalCache from .hasher import MultiModalHasher from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargsItem, MultiModalKwargsItems, - PlaceholderRange) + MultiModalKwargsOptionalItems, PlaceholderRange) from .parse import (DictEmbeddingItems, EmbeddingItems, MultiModalDataItems, MultiModalDataParser) @@ -34,6 +33,7 @@ if TYPE_CHECKING: from transformers.feature_extraction_utils import BatchFeature from transformers.processing_utils import ProcessorMixin + from .cache import BaseMultiModalProcessorCache from .profiling import BaseDummyInputsBuilder logger = init_logger(__name__) @@ -557,6 +557,15 @@ class ResolvedPromptUpdate: return self.iter_token_matches(prompt, tokenizer, start_idx=start_idx) + def with_target(self, target: UpdateTarget): + return replace(self, target=target) + + def with_content(self, content: PromptUpdateInfo): + if not isinstance(content, PromptUpdateDetails): + content = PromptUpdateDetails.from_seq(content) + + return replace(self, content=content) + class _TokenMatch(NamedTuple): start_idx: int @@ -865,21 +874,6 @@ def find_mm_placeholders( return dict(full_groupby_modality(it)) -class ProcessingCache(MultiModalCache): - - def __init__(self, capacity_gb: float) -> None: - super().__init__() - - self._cache = self.get_lru_cache(capacity_gb, MultiModalKwargsItem) - - self.get = self._cache.get - self.put = self._cache.put - self.reset = self._cache.clear - - -_CacheItemOrHash = Union[MultiModalKwargsItem, str] - - class BaseProcessingInfo: """Base class to provide the information necessary for data processing.""" @@ -982,7 +976,7 @@ For an item `MultiModalPromptUpdates[k][i]`, class MultiModalProcessingInfo(NamedTuple): - kwargs: MultiModalKwargsItems + kwargs: MultiModalKwargsOptionalItems hashes: 
MultiModalHashes prompt_updates: MultiModalPromptUpdates @@ -994,11 +988,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): Not to be confused with `transformers.ProcessorMixin`. """ - def __init__(self, - info: _I, - dummy_inputs: "BaseDummyInputsBuilder[_I]", - *, - cache: Optional[ProcessingCache] = None) -> None: + def __init__( + self, + info: _I, + dummy_inputs: "BaseDummyInputsBuilder[_I]", + *, + cache: Optional["BaseMultiModalProcessorCache"] = None, + ) -> None: super().__init__() self.info = info @@ -1355,32 +1351,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): return prompt_ids, mm_processed_data, False - def _get_cache_missing_items( - self, - cache: ProcessingCache, - mm_data_items: MultiModalDataItems, - mm_hashes: MultiModalHashes, - ) -> tuple[dict[str, list[_CacheItemOrHash]], MultiModalDataItems]: - mm_cache_items_or_hashes: dict[str, list[_CacheItemOrHash]] = { - modality: [(h if (v := cache.get(h)) is None else v) - for h in hashes] - for modality, hashes in mm_hashes.items() - } - - mm_missing_idxs = { - modality: [ - idx for idx, item_or_hash in enumerate(items_or_hashes) - if isinstance(item_or_hash, str) - ] - for modality, items_or_hashes in mm_cache_items_or_hashes.items() - } - mm_missing_data = { - modality: [mm_data_items[modality][idx] for idx in idxs] - for modality, idxs in mm_missing_idxs.items() - } - - return mm_cache_items_or_hashes, self._to_mm_items(mm_missing_data) - def _hash_mm_items( self, mm_items: MultiModalDataItems, @@ -1401,28 +1371,92 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): for modality, items in mm_items.items() } + def _get_cache_missing_items( + self, + cache: "BaseMultiModalProcessorCache", + mm_data_items: MultiModalDataItems, + mm_hashes: MultiModalHashes, + ) -> MultiModalDataItems: + mm_is_cached = { + modality: cache.is_cached(hashes) + for modality, hashes in mm_hashes.items() + } + + mm_missing_idxs = { + modality: [ + idx for idx, item_is_cached in enumerate(items_is_cached) + if not 
item_is_cached + ] + for modality, items_is_cached in mm_is_cached.items() + } + mm_missing_data = { + modality: [mm_data_items[modality][idx] for idx in idxs] + for modality, idxs in mm_missing_idxs.items() + } + + return self._to_mm_items(mm_missing_data) + + def _recompute_cached_prompt_update( + self, + cached_update: ResolvedPromptUpdate, + new_item_idx: int, + ) -> ResolvedPromptUpdate: + """ + Override this if other attributes of `ResolvedPromptUpdate` + also need to be recomputed after retrieving from the cache. + """ + return replace(cached_update, item_idx=new_item_idx) + def _merge_mm_kwargs( self, - cache: ProcessingCache, - mm_cache_items_or_hashes: dict[str, list[_CacheItemOrHash]], + cache: "BaseMultiModalProcessorCache", + mm_hashes: MultiModalHashes, mm_missing_kwargs: MultiModalKwargsItems, - ) -> MultiModalKwargsItems: + mm_missing_prompt_updates: MultiModalPromptUpdates, + ) -> tuple[MultiModalKwargsOptionalItems, MultiModalPromptUpdates]: + # Need to calculate this at the beginning to avoid skipping cache logic + # for subsequently repeated items in the same modality + mm_is_cached = { + modality: cache.is_cached(hashes) + for modality, hashes in mm_hashes.items() + } + mm_missing_next_idx = defaultdict[str, int](lambda: 0) - merged_items = defaultdict[str, list[MultiModalKwargsItem]](list) - for modality, items_or_hashes in mm_cache_items_or_hashes.items(): - for item_or_hash in items_or_hashes: - if isinstance(item_or_hash, str): - kw_item = mm_missing_kwargs[modality][ - mm_missing_next_idx[modality]] - cache.put(item_or_hash, kw_item) + merged_kwargs = defaultdict[str, + list[Optional[MultiModalKwargsItem]]](list) + merged_prompt_updates = defaultdict[ + str, list[Sequence[ResolvedPromptUpdate]]](list) + for modality, hashes in mm_hashes.items(): + missing_kwargs = mm_missing_kwargs.get(modality, []) + missing_prompt_updates = mm_missing_prompt_updates.get( + modality, []) + + for item_idx, item_hash in enumerate(hashes): + kwargs: 
Optional[MultiModalKwargsItem] + if not mm_is_cached[modality][item_idx]: + missing_next_idx = mm_missing_next_idx[modality] + kwargs = missing_kwargs[missing_next_idx] + updates = missing_prompt_updates[missing_next_idx] + mm_missing_next_idx[modality] += 1 + + item = kwargs, updates else: - kw_item = item_or_hash + item = None - merged_items[modality].append(kw_item) + kwargs, updates = cache.get_and_update_item(item, item_hash) - return MultiModalKwargsItems(merged_items) + merged_kwargs[modality].append(kwargs) + merged_prompt_updates[modality].append([ + self._recompute_cached_prompt_update(update, item_idx) + for update in updates + ]) + + mm_kwargs = MultiModalKwargsItems(merged_kwargs) + mm_prompt_updates = dict(merged_prompt_updates) + + return mm_kwargs, mm_prompt_updates def _apply_hf_processor( self, @@ -1490,10 +1524,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, tokenization_kwargs) - ( - mm_cache_items_or_hashes, - mm_missing_data_items, - ) = self._get_cache_missing_items( + + mm_missing_data_items = self._get_cache_missing_items( cache=cache, mm_data_items=mm_data_items, mm_hashes=mm_hashes, @@ -1520,16 +1552,17 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs), ) - mm_kwargs = self._merge_mm_kwargs( - cache, - mm_cache_items_or_hashes=mm_cache_items_or_hashes, - mm_missing_kwargs=mm_missing_kwargs, + mm_missing_prompt_updates = self._get_mm_prompt_updates( + mm_missing_data_items, + hf_processor_mm_kwargs, + mm_missing_kwargs, ) - mm_prompt_updates = self._get_mm_prompt_updates( - mm_data_items, - hf_processor_mm_kwargs, - mm_kwargs, + mm_kwargs, mm_prompt_updates = self._merge_mm_kwargs( + cache, + mm_hashes=mm_hashes, + mm_missing_kwargs=mm_missing_kwargs, + mm_missing_prompt_updates=mm_missing_prompt_updates, ) mm_info = MultiModalProcessingInfo( @@ -1614,7 +1647,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): def _validate_mm_kwargs( 
self, - mm_kwargs: MultiModalKwargsItems, + mm_kwargs: MultiModalKwargsOptionalItems, mm_item_counts: Mapping[str, int], ) -> None: for modality, item_count in mm_item_counts.items(): @@ -1655,7 +1688,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): self, mm_items: MultiModalDataItems, prompt_ids: list[int], - mm_kwargs: MultiModalKwargsItems, + mm_kwargs: MultiModalKwargsOptionalItems, mm_prompt_updates: MultiModalPromptUpdates, is_update_applied: bool, ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index ea2efbdd8b524..ffc69a2db60a4 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -13,7 +13,7 @@ import vllm.envs as envs from vllm.logger import init_logger from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, - MultiModalInputs, MultiModalKwargsItems, + MultiModalInputs, MultiModalKwargsOptionalItems, MultiModalPlaceholderDict) from .processing import (BaseMultiModalProcessor, BaseProcessingInfo, EncDecMultiModalProcessor) @@ -43,7 +43,7 @@ class DummyDecoderData(NamedTuple): """Dummy data used for profiling.""" prompt_token_ids: list[int] - multi_modal_data: MultiModalKwargsItems + multi_modal_data: MultiModalKwargsOptionalItems multi_modal_placeholders: MultiModalPlaceholderDict diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 8cd9e5604872a..38adbf8f3536a 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping from dataclasses import dataclass -from functools import lru_cache from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar import torch.nn as nn @@ -13,8 +12,9 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, cached_tokenizer_from_config) from vllm.utils import ClassRegistry -from .processing import 
(BaseMultiModalProcessor, BaseProcessingInfo, - ProcessingCache) +from .cache import (BaseMultiModalProcessorCache, + processor_only_cache_from_config) +from .processing import BaseMultiModalProcessor, BaseProcessingInfo from .profiling import (BaseDummyInputsBuilder, DummyDecoderData, DummyEncoderData, MultiModalProfiler) @@ -65,7 +65,7 @@ class MultiModalProcessorFactory(Protocol[_I]): info: _I, dummy_inputs: BaseDummyInputsBuilder[_I], *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor[_I]: ... @@ -80,20 +80,13 @@ class _ProcessorFactories(Generic[_I]): self, ctx: InputProcessingContext, *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ): info = self.info(ctx) dummy_inputs_builder = self.dummy_inputs(info) return self.processor(info, dummy_inputs_builder, cache=cache) -# Make sure a different cache is used for each model config -# NOTE: ModelConfig is not hashable so it cannot be passed directly -@lru_cache(maxsize=1) -def _get_processor_cache(model_id: str, capacity_gb: int): - return ProcessingCache(capacity_gb) if capacity_gb > 0 else None - - class MultiModalRegistry: """ A registry that dispatches data processing according to the model. @@ -103,31 +96,6 @@ class MultiModalRegistry: self._processor_factories = ClassRegistry[nn.Module, _ProcessorFactories]() - def _get_processor_cache(self, model_config: "ModelConfig"): - model_id = model_config.model - capacity_gb = model_config.mm_processor_cache_gb - return _get_processor_cache(model_id, capacity_gb) - - def reset_processor_cache(self, model_config: "ModelConfig") -> bool: - """Reset the multi-modal processing cache.""" - if processor_cache := self._get_processor_cache(model_config): - processor_cache.reset() - - return True # Success - - def enable_mm_input_cache(self, model_config: "ModelConfig") -> bool: - """Whether the multi-modal input cache should be enabled. 
- NOTE: This is put under MultiModalRegistry on purpose to respect - text-only mode for multimodal models. - """ - - if not self.supports_multimodal_inputs(model_config): - return False - - mm_config = model_config.get_multimodal_config() - - return mm_config.mm_processor_cache_gb > 0 - def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool: """ Checks if the model supports multimodal inputs. @@ -157,6 +125,8 @@ class MultiModalRegistry: def get_max_tokens_per_item_by_modality( self, model_config: "ModelConfig", + *, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> Mapping[str, int]: """ Get the maximum number of tokens per data item from each modality based @@ -165,11 +135,11 @@ class MultiModalRegistry: if not model_config.is_multimodal_model: return {} - processor = self.create_processor(model_config, disable_cache=False) + processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) seq_len = model_config.max_model_len - mm_limits = self.get_mm_limits_per_prompt(model_config) + mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache) return profiler.get_mm_max_contiguous_tokens( seq_len, @@ -182,6 +152,8 @@ class MultiModalRegistry: def get_max_tokens_per_item_by_nonzero_modality( self, model_config: "ModelConfig", + *, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> Mapping[str, int]: """ Get the maximum number of tokens per data item from each modality based @@ -192,15 +164,19 @@ class MultiModalRegistry: This is currently directly used only in V1 for profiling the memory usage of a model. 
""" - mm_limits = self.get_mm_limits_per_prompt(model_config) + mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache) + max_tokens_per_item = self.get_max_tokens_per_item_by_modality( + model_config, + cache=cache, + ) return { key: max_tokens_per_mm_item - for key, max_tokens_per_mm_item in - self.get_max_tokens_per_item_by_modality(model_config).items() + for key, max_tokens_per_mm_item in max_tokens_per_item.items() if mm_limits[key] > 0 } + # TODO: Remove once V0 is gone def get_max_tokens_by_modality( self, model_config: "ModelConfig", @@ -209,14 +185,19 @@ class MultiModalRegistry: Get the maximum number of tokens from each modality for profiling the memory usage of a model. """ - mm_limits = self.get_mm_limits_per_prompt(model_config) + cache = processor_only_cache_from_config(model_config, self) + mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache) + max_tokens_per_item = self.get_max_tokens_per_item_by_modality( + model_config, + cache=cache, + ) return { key: mm_limits[key] * max_tokens_per_mm_item - for key, max_tokens_per_mm_item in - self.get_max_tokens_per_item_by_modality(model_config).items() + for key, max_tokens_per_mm_item in max_tokens_per_item.items() } + # TODO: Remove once V0 is gone def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: """ Get the maximum number of multi-modal tokens @@ -227,6 +208,8 @@ class MultiModalRegistry: def get_mm_limits_per_prompt( self, model_config: "ModelConfig", + *, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> Mapping[str, int]: """ Get the maximum number of multi-modal input instances for each modality @@ -235,7 +218,7 @@ class MultiModalRegistry: if not model_config.is_multimodal_model: return {} - processor = self.create_processor(model_config, disable_cache=False) + processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) return profiler.get_mm_limits() @@ -303,7 +286,7 @@ class 
MultiModalRegistry: model_config: "ModelConfig", *, tokenizer: Optional[AnyTokenizer] = None, - disable_cache: Optional[bool] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor[BaseProcessingInfo]: """ Create a multi-modal processor for a specific model and tokenizer. @@ -311,15 +294,10 @@ class MultiModalRegistry: if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") - if disable_cache is None: - disable_cache = not model_config.enable_mm_processor_cache - model_cls = self._get_model_cls(model_config) factories = self._processor_factories[model_cls] ctx = self._create_processing_ctx(model_config, tokenizer) - cache = None if disable_cache else self._get_processor_cache( - model_config) return factories.build_processor(ctx, cache=cache) @@ -328,13 +306,15 @@ class MultiModalRegistry: model_config: "ModelConfig", seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + *, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> DummyDecoderData: """ Create dummy data for profiling the memory usage of a model. The model is identified by ``model_config``. """ - processor = self.create_processor(model_config, disable_cache=False) + processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts) @@ -352,13 +332,15 @@ class MultiModalRegistry: model_config: "ModelConfig", seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + *, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> DummyEncoderData: """ Create dummy data for profiling the memory usage of a model. The model is identified by ``model_config``. 
""" - processor = self.create_processor(model_config, disable_cache=False) + processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 342d7b24f8e98..dbea0b610b31a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -597,8 +597,7 @@ class AsyncLLM(EngineClient): await asyncio.gather(*coros) async def reset_mm_cache(self) -> None: - self.processor.mm_registry.reset_processor_cache(self.model_config) - self.processor.mm_input_cache_client.reset() + self.processor.clear_cache() await self.engine_core.reset_mm_cache_async() async def reset_prefix_cache(self, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 32765cda6482f..b614828061846 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -22,6 +22,7 @@ from vllm.logger import init_logger from vllm.logging_utils.dump_input import dump_engine_exception from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import receiver_cache_from_config from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) @@ -38,7 +39,6 @@ from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType, ReconfigureDistributedRequest, ReconfigureRankType, UtilityOutput, UtilityResult) -from vllm.v1.engine.mm_input_cache import MultiModalInputCacheServer from vllm.v1.engine.utils import EngineHandshakeMetadata, EngineZmqAddresses from vllm.v1.executor.abstract import Executor from vllm.v1.kv_cache_interface import KVCacheConfig @@ -128,8 +128,9 @@ class EngineCore: ) self.use_spec_decode = vllm_config.speculative_config is not None - self.mm_input_cache_server = MultiModalInputCacheServer( - vllm_config.model_config, MULTIMODAL_REGISTRY) + 
self.mm_registry = mm_registry = MULTIMODAL_REGISTRY + self.mm_receiver_cache = receiver_cache_from_config( + vllm_config, mm_registry) # Setup batch queue for pipeline parallelism. # Batch queue for scheduled batches. This enables us to asynchronously @@ -370,7 +371,8 @@ class EngineCore: logger.warning("Resetting the multi-modal cache when requests are " "in progress may lead to desynced internal caches.") - self.mm_input_cache_server.reset() + if self.mm_receiver_cache is not None: + self.mm_receiver_cache.clear_cache() def reset_prefix_cache(self): self.scheduler.reset_prefix_cache() @@ -435,10 +437,11 @@ class EngineCore: assert request.mm_kwargs is not None # Note on thread safety: no race condition. - # `mm_input_cache_server` is reset at the end of LLMEngine init, + # `mm_receiver_cache` is reset at the end of LLMEngine init, # and will only accessed in the input processing thread afterwards. - request.mm_kwargs = self.mm_input_cache_server.get_and_update( - request.mm_kwargs, request.mm_hashes) + if self.mm_receiver_cache is not None: + request.mm_kwargs = self.mm_receiver_cache.get_and_update( + request.mm_kwargs, request.mm_hashes) req = Request.from_engine_core_request(request, self.request_block_hasher) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 5a00a930951cc..7130f666ef19f 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -271,8 +271,7 @@ class LLMEngine: self.engine_core.profile(False) def reset_mm_cache(self): - self.processor.mm_registry.reset_processor_cache(self.model_config) - self.processor.mm_input_cache_client.reset() + self.processor.clear_cache() self.engine_core.reset_mm_cache() def reset_prefix_cache(self, device: Optional[Device] = None): diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py deleted file mode 100644 index aa7dc62fd4acb..0000000000000 --- a/vllm/v1/engine/mm_input_cache.py +++ /dev/null @@ -1,121 +0,0 @@ -# 
SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Sequence -from typing import TYPE_CHECKING, Optional - -from vllm.multimodal import MultiModalRegistry -from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata -from vllm.multimodal.inputs import MultiModalKwargsItem -from vllm.utils import is_list_of - -if TYPE_CHECKING: - from vllm.config import ModelConfig - -# The idea of multimodal input caching is based on having a client and -# a server, where the client executes in the frontend process (=P0) and the -# server in the core process (=P1). -# -# -- P0: -# - BaseMultiModalProcessor calls MultiModalHasher to get the `mm_hash` of -# each input multi-modal item (e.g. image), -# - BaseMultiModalProcessor processes the input items into `mm_kwargs`, -# which are MultiModalKwargsItem instances that each correspond to an -# input multi-modal item. -# - MultiModalInputCacheClient accepts the `mm_kwargs` and corresponding -# `mm_hash` for each item. It stores the `mm_hash` as keys and the size -# of `mm_kwargs`, but not the `mm_kwargs` themselves, to avoid taking -# up additional memory in P0. -# - The `mm_hash` is always sent to P1. -# - The corresponding `mm_kwargs` are only sent to P1 if they are not cached -# in MultiModalInputCacheServer. -# -# -- P1: -# - If the `mm_hash` is cached (i.e. `mm_kwargs` are not sent from P0), -# MultiModalInputCacheServer retrieves the corresponding `mm_kwargs`. -# - If the `mm_hash` is not cached (i.e. `mm_kwargs` are sent from P0), -# MultiModalInputCacheServer stores `mm_kwargs` under the key `mm_hash`. -# - Either way, the `mm_hash` and corresponding `mm_kwargs` are sent to -# the engine for model execution. -# -# Both Client and Server must perform cache update and eviction based on the -# same item size. 
This ensures that the keys of MultiModalInputCacheClient -# and MultiModalInputCacheServer are mirrored, allowing us to determine in P0 -# whether a key is cached in MultiModalInputCacheServer by querying -# MultiModalInputCacheClient without having to communicate with P1. - - -class MultiModalInputCacheClient: - """Used by P0 to check whether multi-modal kwargs are cached in P1.""" - - def __init__(self, model_config: "ModelConfig", - mm_registry: MultiModalRegistry) -> None: - super().__init__() - - self.enabled = mm_registry.enable_mm_input_cache(model_config) - self.mm_cache = MultiModalCache.get_lru_cache( - model_config.get_mm_input_cache_gb(), - MultiModalCacheItemMetadata, - ) - - def get_and_update( - self, - mm_kwargs: Sequence[MultiModalKwargsItem], - mm_hashes: list[str], - ) -> list[Optional[MultiModalKwargsItem]]: - if not self.enabled: - return list(mm_kwargs) - - assert len(mm_kwargs) == len(mm_hashes) - - out_mm_items = list[Optional[MultiModalKwargsItem]]() - for mm_item, mm_hash in zip(mm_kwargs, mm_hashes): - if self.mm_cache.get(mm_hash) is not None: - out_mm_items.append(None) - else: - self.mm_cache[mm_hash] = \ - MultiModalCacheItemMetadata.wraps(mm_item) - out_mm_items.append(mm_item) - - return out_mm_items - - def reset(self) -> None: - self.mm_cache.clear() - - -class MultiModalInputCacheServer: - """Used by P1 to avoid requiring past multi-modal kwargs from P0.""" - - def __init__(self, model_config: "ModelConfig", - mm_registry: MultiModalRegistry) -> None: - super().__init__() - - self.enabled = mm_registry.enable_mm_input_cache(model_config) - self.mm_cache = MultiModalCache.get_lru_cache( - model_config.get_mm_input_cache_gb(), - MultiModalKwargsItem, - ) - - def get_and_update( - self, - mm_kwargs: Sequence[Optional[MultiModalKwargsItem]], - mm_hashes: list[str], - ) -> list[MultiModalKwargsItem]: - if not self.enabled: - mm_kwargs_lst = list(mm_kwargs) - assert is_list_of(mm_kwargs_lst, MultiModalKwargsItem) - return mm_kwargs_lst 
- - assert len(mm_kwargs) == len(mm_hashes) - - out_mm_items = list[MultiModalKwargsItem]() - for mm_item, mm_hash in zip(mm_kwargs, mm_hashes): - if mm_item is None: - out_mm_items.append(self.mm_cache[mm_hash]) - else: - self.mm_cache[mm_hash] = mm_item - out_mm_items.append(mm_item) - - return out_mm_items - - def reset(self) -> None: - self.mm_cache.clear() diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 300b0713b2ffe..7ed60156626bf 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -11,6 +11,7 @@ from vllm.inputs.parse import split_enc_dec_inputs from vllm.inputs.preprocess import InputPreprocessor from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal.cache import processor_cache_from_config from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.multimodal.utils import argsort_mm_positions @@ -18,7 +19,6 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient from vllm.v1.structured_output.backend_guidance import ( validate_guidance_grammar) from vllm.v1.structured_output.backend_lm_format_enforcer import ( @@ -47,16 +47,17 @@ class Processor: self.generation_config_fields = ( self.model_config.try_get_generation_config()) - self.input_preprocessor = InputPreprocessor(self.model_config, - self.tokenizer, - mm_registry) - self.mm_input_cache_client = MultiModalInputCacheClient( - self.model_config, mm_registry) + self.mm_registry = mm_registry + self.mm_processor_cache = processor_cache_from_config( + vllm_config, mm_registry) - @property - def mm_registry(self): - return self.input_preprocessor.mm_registry + 
self.input_preprocessor = InputPreprocessor( + self.model_config, + self.tokenizer, + mm_registry, + mm_processor_cache=self.mm_processor_cache, + ) def _validate_logprobs( self, @@ -310,7 +311,7 @@ class Processor: # in the input sequence. sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions) - orig_sorted_mm_inputs = [ + sorted_mm_inputs = [ decoder_mm_inputs[modality][idx] for modality, idx in sorted_mm_idxs ] @@ -323,11 +324,6 @@ class Processor: for modality, idx in sorted_mm_idxs ] - sorted_mm_inputs = self.mm_input_cache_client.get_and_update( - orig_sorted_mm_inputs, - sorted_mm_hashes, - ) - return decoder_inputs.get("prompt"), EngineCoreRequest( request_id=request_id, prompt_token_ids=decoder_inputs["prompt_token_ids"], @@ -415,3 +411,6 @@ class Processor: # TODO: Find out how many placeholder tokens are there so we can # check that chunked prefill does not truncate them # max_batch_len = self.scheduler_config.max_num_batched_tokens + + def clear_cache(self) -> None: + self.input_preprocessor.clear_cache() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f1ceaaae62a70..053aaf4f968e0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2186,10 +2186,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): max_items_per_batch: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" + assert self.mm_budget is not None + dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, mm_counts={modality: 1}, + cache=self.mm_budget.cache, ) dummy_mm_data = dummy_decoder_data.multi_modal_data diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 4a485b7e077d4..d364236604274 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1813,10 +1813,13 @@ class 
TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): max_items_per_batch: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" + assert self.mm_budget is not None + dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, mm_counts={modality: 1}, + cache=self.mm_budget.cache, ) dummy_mm_data = dummy_decoder_data.multi_modal_data diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index b96473e7b1645..82ede5ad8eb1e 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -10,6 +10,7 @@ from vllm.attention.backends.abstract import AttentionBackend from vllm.config import ModelConfig, SchedulerConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index +from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.registry import MultiModalRegistry from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget @@ -33,14 +34,18 @@ class MultiModalBudget: self.model_config = model_config self.scheduler_config = scheduler_config self.mm_registry = mm_registry + self.cache = cache = processor_only_cache_from_config( + model_config, mm_registry) self.max_model_len = model_config.max_model_len self.max_num_reqs = scheduler_config.max_num_seqs - self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config) + self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, + cache=cache) max_tokens_by_modality = mm_registry \ - .get_max_tokens_per_item_by_nonzero_modality(model_config) + .get_max_tokens_per_item_by_nonzero_modality(model_config, + cache=cache) encoder_compute_budget, encoder_cache_size = compute_mm_encoder_budget( scheduler_config, From 64466778397482e0cb9ff9f6b320ca6d9dc567ae Mon Sep 17 00:00:00 2001 From: Kunshang Ji 
<kunshang.ji@intel.com> Date: Wed, 27 Aug 2025 15:27:14 +0800 Subject: [PATCH 074/112] [XPU]fix cuda event used in XPU model runner (#23708) Signed-off-by: Kunshang Ji <kunshang.ji@intel.com> --- vllm/v1/worker/xpu_model_runner.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/xpu_model_runner.py b/vllm/v1/worker/xpu_model_runner.py index 59f8d0fcf5bd9..fb892211f19db 100644 --- a/vllm/v1/worker/xpu_model_runner.py +++ b/vllm/v1/worker/xpu_model_runner.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from contextlib import contextmanager from typing import TYPE_CHECKING import torch @@ -22,7 +23,8 @@ class XPUModelRunner(GPUModelRunner): vllm_config: VllmConfig, device: torch.device, ): - super().__init__(vllm_config, device) + with _torch_cuda_wrapper(): + super().__init__(vllm_config, device) # FIXME: To be verified. self.cascade_attn_enabled = False @@ -31,3 +33,21 @@ class XPUModelRunner(GPUModelRunner): def _sync_device(self) -> None: torch.xpu.synchronize() + + +@contextmanager +def _torch_cuda_wrapper(): + + class _EventPlaceholder: + + def __init__(self, *args, **kwargs) -> None: + self.record = lambda: None + self.synchronize = lambda: None + + try: + # replace cuda Event with xpu Event, this should work by default + torch.cuda.Event = torch.xpu.Event + yield + finally: + # if anything goes wrong, just patch it with a placeholder + torch.cuda.Event = _EventPlaceholder From 91e382c935c2905c29f3ca22c658e03e8f02deaa Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Wed, 27 Aug 2025 16:11:15 +0800 Subject: [PATCH 075/112] [CI/Build] Remove redundant register in model init tests (#23715) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- tests/models/test_initialization.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/models/test_initialization.py 
b/tests/models/test_initialization.py index bbd3da982af84..b4d516233b4bf 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -38,11 +38,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, model_arch=model_arch, exist_overrides=model_info.hf_overrides) - if model_arch in ("Llama4ForCausalLM", "EagleLlama4ForCausalLM"): - from vllm.model_executor.models.llama4 import Llama4ForCausalLM - from vllm.model_executor.models.registry import ModelRegistry - ModelRegistry.register_model("Llama4ForCausalLM", Llama4ForCausalLM) - # Avoid calling model.forward() def _initialize_kv_caches_v0(self) -> None: self.cache_config.num_gpu_blocks = 0 From 5bd9f841581a3a9e9eecdd8764240575bb28e391 Mon Sep 17 00:00:00 2001 From: Michael Yao <haifeng.yao@daocloud.io> Date: Wed, 27 Aug 2025 17:50:09 +0800 Subject: [PATCH 076/112] [Docs] Fix an admonition important (#23726) Signed-off-by: windsonsea <haifeng.yao@daocloud.io> --- docs/configuration/optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 3eaf2185a559e..a8eab9985c8b9 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -164,7 +164,7 @@ llm = LLM( ) ``` -!! important +!!! important Batch-level DP is not to be confused with API request-level DP (which is instead controlled by `data_parallel_size`). 
From 6578e873655859462758c5c51e51f876f2aa24a3 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Wed, 27 Aug 2025 02:52:45 -0700 Subject: [PATCH 077/112] Optimize input preparation for FlashInfer [2/N] (#23174) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- vllm/v1/attention/backends/flashinfer.py | 82 ++++++++++++++++-------- 1 file changed, 55 insertions(+), 27 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 941d2a4d7f1ac..f948157c2b575 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -6,6 +6,7 @@ from __future__ import annotations from dataclasses import dataclass from typing import ClassVar, Optional, Union +import numpy as np import torch from flashinfer import (BatchDecodeWithPagedKVCacheWrapper, BatchPrefillWithPagedKVCacheWrapper, @@ -22,6 +23,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, kFp8StaticTensorSym, kNvfp4Quant) from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton from vllm.utils import cdiv, is_pin_memory_available from vllm.utils.flashinfer import (supports_trtllm_attention, use_trtllm_attention) @@ -230,6 +232,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): dtype=torch.int32, device="cpu", pin_memory=pin_memory) + self.paged_kv_indptr_np = self.paged_kv_indptr_cpu.numpy() self.paged_kv_indices_cpu = torch.zeros(max_num_pages, dtype=torch.int32, device="cpu", @@ -238,10 +241,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): dtype=torch.int32, device="cpu", pin_memory=pin_memory) - - self.block_table_arange = torch.arange(max_num_pages_per_req, - dtype=torch.int32, - device=self.device) + self.paged_kv_last_page_len_np = ( + self.paged_kv_last_page_len_cpu.numpy()) def _get_workspace_buffer(self): if 
self._workspace_buffer is None: @@ -317,9 +318,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): max_seq_len = common_attn_metadata.max_seq_len seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu + seq_lens_np = seq_lens_cpu.numpy() block_table_tensor = common_attn_metadata.block_table_tensor - block_table_bounds_cpu = (seq_lens_cpu + page_size - 1) // page_size + num_blocks_np = (seq_lens_np + (page_size - 1)) // page_size use_cascade = common_prefix_len > 0 if use_cascade: @@ -342,37 +344,41 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): # Remove the blocks of the shared prefix from all requests. block_table_tensor = block_table_tensor[:, num_common_kv_blocks:] - block_table_bounds_cpu -= num_common_kv_blocks + num_blocks_np -= num_common_kv_blocks else: shared_qo_indptr_cpu = None shared_kv_page_indptr_cpu = None shared_kv_page_indices_cpu = None shared_kv_last_page_len_cpu = None - max_num_blocks = block_table_bounds_cpu.max().item() - block_table_bounds = block_table_bounds_cpu.to(self.device, - non_blocking=True) - mask = (self.block_table_arange[:max_num_blocks].unsqueeze(0) - < block_table_bounds.unsqueeze(1)) - # write self.paged_kv_indices inplace - num_actual_pages = torch.sum(mask) - paged_kv_indices = self.paged_kv_indices[:num_actual_pages] - torch.masked_select(block_table_tensor[:, :max_num_blocks], - mask, - out=paged_kv_indices) - # write self.paged_kv_indptr_cpu inplace (0-index is always 0) - torch.cumsum(block_table_bounds_cpu, - dim=0, - dtype=torch.int32, - out=self.paged_kv_indptr_cpu[1:1 + num_reqs]) + np.cumsum( + num_blocks_np, + dtype=np.int32, + out=self.paged_kv_indptr_np[1:num_reqs + 1], + ) + paged_kv_indptr = self.paged_kv_indptr[:num_reqs + 1] + paged_kv_indptr.copy_(self.paged_kv_indptr_cpu[:num_reqs + 1], + non_blocking=True) + + # write self.paged_kv_indices inplace + num_actual_pages = num_blocks_np.sum().item() + 
paged_kv_indices = self.paged_kv_indices[:num_actual_pages] + _copy_page_indices_kernel[(num_reqs, )]( + paged_kv_indices, + block_table_tensor, + block_table_tensor.stride(0), + paged_kv_indptr, + BLOCK_SIZE=1024, + ) - paged_kv_last_page_len_cpu = seq_lens_cpu % page_size # write self.paged_kv_last_page_len_cpu inplace - torch.where(paged_kv_last_page_len_cpu == 0, - torch.tensor(page_size), - paged_kv_last_page_len_cpu, - out=self.paged_kv_last_page_len_cpu[:num_reqs]) + paged_kv_last_page_len_np = seq_lens_np % page_size + self.paged_kv_last_page_len_np[:num_reqs] = np.where( + paged_kv_last_page_len_np == 0, + page_size, + paged_kv_last_page_len_np, + ) # Check if any layer uses sinks (requires TRTLLM attention) has_sinks = self.global_hyperparameters.has_sinks @@ -1002,3 +1008,25 @@ def fast_plan_decode( self._sm_scale = sm_scale self._rope_scale = rope_scale self._rope_theta = rope_theta + + +@triton.jit +def _copy_page_indices_kernel( + page_indices, + block_table, + block_table_stride, + cu_num_blocks, + BLOCK_SIZE: tl.constexpr, +): + req_idx = tl.program_id(0) + row_ptr = block_table + req_idx * block_table_stride + start_idx = tl.load(cu_num_blocks + req_idx) + end_idx = tl.load(cu_num_blocks + req_idx + 1) + num_blocks = end_idx - start_idx + + offset = tl.arange(0, BLOCK_SIZE) + for i in tl.range(0, num_blocks, BLOCK_SIZE): + block_ids = tl.load(row_ptr + i + offset, mask=i + offset < num_blocks) + tl.store(page_indices + start_idx + i + offset, + block_ids, + mask=i + offset < num_blocks) From 04ff1e43fb6e2e675170d0c90399290f8925abb7 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Wed, 27 Aug 2025 03:25:00 -0700 Subject: [PATCH 078/112] [Misc] Move CpuGpuBuffer to vllm/v1/utils.py (#23728) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- vllm/v1/utils.py | 29 +++++++++++++++++++++++++++++ vllm/v1/worker/cpu_model_runner.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 6 +++--- vllm/v1/worker/utils.py | 29 
----------------------------- 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index b5750c82db023..8f9face6fbf2e 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -96,6 +96,35 @@ class ConstantList(Generic[T], Sequence): return f"ConstantList({self._x})" +class CpuGpuBuffer: + + def __init__( + self, + *args, + dtype: torch.dtype, + device: torch.device, + pin_memory: bool, + ): + self.cpu = torch.zeros(*args, + dtype=dtype, + device="cpu", + pin_memory=pin_memory) + self.np = self.cpu.numpy() + self.gpu = self.cpu.to(device) + + def copy_to_gpu(self, n: Optional[int] = None) -> torch.Tensor: + if n is None: + return self.gpu.copy_(self.cpu, non_blocking=True) + return self.gpu[:n].copy_(self.cpu[:n], non_blocking=True) + + def copy_to_cpu(self, n: Optional[int] = None) -> torch.Tensor: + """NOTE: Because this method is non-blocking, explicit synchronization + is needed to ensure the data is copied to CPU.""" + if n is None: + return self.cpu.copy_(self.gpu, non_blocking=True) + return self.cpu[:n].copy_(self.gpu[:n], non_blocking=True) + + def get_engine_client_zmq_addr(local_only: bool, host: str, port: int = 0) -> str: diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 137578f0e6088..742e553b77e09 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -10,8 +10,8 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.v1.attention.backends.cpu_attn import TorchSDPAMetadataBuilderV1 +from vllm.v1.utils import CpuGpuBuffer from vllm.v1.worker.gpu_model_runner import GPUModelRunner -from vllm.v1.worker.utils import CpuGpuBuffer if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 053aaf4f968e0..d93460d618e7c 100644 --- 
a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -78,14 +78,14 @@ from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.spec_decode.medusa import MedusaProposer from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer +from vllm.v1.utils import CpuGpuBuffer from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorModelRunnerMixin, KVConnectorOutput) from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from .utils import (AttentionGroup, CpuGpuBuffer, MultiModalBudget, - bind_kv_cache, gather_mm_placeholders, - initialize_kv_cache_for_kv_sharing, +from .utils import (AttentionGroup, MultiModalBudget, bind_kv_cache, + gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) if TYPE_CHECKING: diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 82ede5ad8eb1e..f407534687662 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -303,32 +303,3 @@ def bind_kv_cache( for layer_name, kv_cache in kv_caches.items(): # NOTE: Use list because of v0 PP virtual engine. 
forward_context[layer_name].kv_cache = [kv_cache] - - -class CpuGpuBuffer: - - def __init__( - self, - *args, - dtype: torch.dtype, - device: torch.device, - pin_memory: bool, - ): - self.cpu = torch.zeros(*args, - dtype=dtype, - device="cpu", - pin_memory=pin_memory) - self.np = self.cpu.numpy() - self.gpu = self.cpu.to(device) - - def copy_to_gpu(self, n: Optional[int] = None) -> None: - if n is None: - return self.gpu.copy_(self.cpu, non_blocking=True) - return self.gpu[:n].copy_(self.cpu[:n], non_blocking=True) - - def copy_to_cpu(self, n: Optional[int] = None) -> None: - """NOTE: Because this method is non-blocking, explicit synchronization - is needed to ensure the data is copied to CPU.""" - if n is None: - return self.cpu.copy_(self.gpu, non_blocking=True) - return self.cpu[:n].copy_(self.gpu[:n], non_blocking=True) From 11eddf02f0234f79435d747f2d3dce117ab39aa1 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Wed, 27 Aug 2025 03:45:04 -0700 Subject: [PATCH 079/112] [FlashInfer] Cache hyper params in metadata builder (#23732) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- vllm/v1/attention/backends/flashinfer.py | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index f948157c2b575..1115fc606b055 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -214,6 +214,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): # TODO: discard this for trtllm-gen backend self.global_hyperparameters = infer_global_hyperparameters( get_per_layer_parameters(vllm_config, layer_names, FlashInferImpl)) + self.sm_scale = self.global_hyperparameters.sm_scale + self.window_left = self.global_hyperparameters.window_left + self.logits_soft_cap = self.global_hyperparameters.logits_soft_cap + self.has_sinks = 
self.global_hyperparameters.has_sinks # Preparing persistent buffers (device-side) self.paged_kv_indptr = torch.zeros(max_num_reqs + 1, @@ -381,8 +385,6 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): ) # Check if any layer uses sinks (requires TRTLLM attention) - has_sinks = self.global_hyperparameters.has_sinks - prefill_use_trtllm = use_trtllm_attention(self.num_qo_heads, self.num_kv_heads, num_prefill_tokens, @@ -390,7 +392,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.cache_dtype, self.q_data_type, is_prefill=True, - has_sinks=has_sinks) + has_sinks=self.has_sinks) decode_use_trtllm = use_trtllm_attention(self.num_qo_heads, self.num_kv_heads, num_decode_tokens, @@ -398,7 +400,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.cache_dtype, self.q_data_type, is_prefill=False, - has_sinks=has_sinks) + has_sinks=self.has_sinks) attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, @@ -433,9 +435,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.head_dim, self.page_size, causal=True, - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters.logits_soft_cap, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, q_data_type=self.q_data_type, kv_data_type=self.kv_cache_dtype, ) @@ -472,10 +474,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.head_dim, self.page_size, causal=True, - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters. 
- logits_soft_cap, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, q_data_type=self.q_data_type, kv_data_type=self.kv_cache_dtype, ) @@ -525,10 +526,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.page_size, # Disable flashinfer's pos encoding and use vllm's rope. pos_encoding_mode="NONE", - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters. - logits_soft_cap, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, q_data_type=self.q_data_type, kv_data_type=self.kv_cache_dtype, ) From e03940762b43812fccd3c214bda60201cff9d16a Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Wed, 27 Aug 2025 18:59:35 +0800 Subject: [PATCH 080/112] [CI/Build] Reduce LoRA layer test cases (#23721) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- tests/lora/test_layers.py | 72 ++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 39 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 92db023babc28..6e2dda464d8eb 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -243,7 +243,7 @@ def check_punica_wrapper(punica_wrapper) -> bool: @torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) @pytest.mark.parametrize("stage", STAGES) @@ -347,7 +347,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: @torch.inference_mode() # @pytest.mark.skip( # reason="Fails when loras are in any slot other than the first.") -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("device", DEVICES) 
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) @pytest.mark.parametrize("stage", STAGES) @@ -486,7 +486,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device, @torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512]) @pytest.mark.parametrize("stage", STAGES) @@ -620,12 +620,15 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, @torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("stage", STAGES) -@pytest.mark.parametrize("bias_enabled", [True, False]) -def test_linear_replicated(dist_init, num_loras, device, stage, - bias_enabled) -> None: +def test_linear_replicated( + dist_init, + num_loras, + device, + stage, +) -> None: if current_platform.is_cuda_alike(): torch.cuda.set_device(device) @@ -634,10 +637,11 @@ def test_linear_replicated(dist_init, num_loras, device, stage, torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - lora_dtype=torch.float16, - bias_enabled=bias_enabled) + lora_config = LoRAConfig( + max_loras=max_loras, + max_lora_rank=8, + lora_dtype=torch.float16, + ) def create_random_linear_replicated_layer(): @@ -651,10 +655,6 @@ def test_linear_replicated(dist_init, num_loras, device, stage, lora_linear.create_lora_weights(max_loras, lora_config) assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( lora_linear.lora_b_stacked) == 1) - if bias_enabled: - assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices - else: - assert lora_linear.lora_bias_stacked is None return 
linear, lora_linear for i in range(NUM_RANDOM_SEEDS): @@ -734,14 +734,13 @@ def test_linear_replicated(dist_init, num_loras, device, stage, @torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("orientation", ["row", "column"]) @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("stage", STAGES) -@pytest.mark.parametrize("bias_enabled", [True, False]) def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, - device, stage, bias_enabled) -> None: + device, stage) -> None: if current_platform.is_cuda_alike(): torch.cuda.set_device(device) @@ -750,11 +749,12 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - fully_sharded_loras=fully_shard, - lora_dtype=torch.float16, - bias_enabled=bias_enabled) + lora_config = LoRAConfig( + max_loras=max_loras, + max_lora_rank=8, + fully_sharded_loras=fully_shard, + lora_dtype=torch.float16, + ) def create_random_linear_parallel_layer(): if orientation == "row": @@ -777,10 +777,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, lora_linear.create_lora_weights(max_loras, lora_config) assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( lora_linear.lora_b_stacked) == 1) - if bias_enabled: - assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices - else: - assert lora_linear.lora_bias_stacked is None + return linear, lora_linear for i in range(NUM_RANDOM_SEEDS): @@ -860,14 +857,13 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, @torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) 
+@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("repeats", [1, 2, 3]) @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("stage", STAGES) -@pytest.mark.parametrize("bias_enabled", [True, False]) def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, - device, stage, bias_enabled) -> None: + device, stage) -> None: if current_platform.is_cuda_alike(): torch.cuda.set_device(device) @@ -876,11 +872,12 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - fully_sharded_loras=fully_shard, - lora_dtype=torch.float16, - bias_enabled=bias_enabled) + lora_config = LoRAConfig( + max_loras=max_loras, + max_lora_rank=8, + fully_sharded_loras=fully_shard, + lora_dtype=torch.float16, + ) def create_column_parallel_packed_layer(): if repeats == 2: @@ -924,10 +921,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, model_config=FakeConfig()) assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( lora_linear.lora_b_stacked) == n_slices) - if bias_enabled: - assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices - else: - assert lora_linear.lora_bias_stacked is None + return linear, lora_linear for i in range(NUM_RANDOM_SEEDS): From 8f0d7eaea87409a54ccaed76995b59c6b0a3d4cf Mon Sep 17 00:00:00 2001 From: Fanli Lin <fanli0116@gmail.com> Date: Wed, 27 Aug 2025 19:57:38 +0800 Subject: [PATCH 081/112] [XPU] Fix OOM issue for data parallel with Ray backend (#22500) Signed-off-by: Fanli Lin <fanli.lin@intel.com> Signed-off-by: Fanli Lin <fanli0116@gmail.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> --- vllm/v1/engine/core.py | 27 ++++++++++++++++++--------- 
vllm/v1/engine/utils.py | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index b614828061846..a7038e2d2c264 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -39,7 +39,8 @@ from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType, ReconfigureDistributedRequest, ReconfigureRankType, UtilityOutput, UtilityResult) -from vllm.v1.engine.utils import EngineHandshakeMetadata, EngineZmqAddresses +from vllm.v1.engine.utils import (EngineHandshakeMetadata, EngineZmqAddresses, + get_device_indices) from vllm.v1.executor.abstract import Executor from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.metrics.stats import SchedulerStats @@ -1169,22 +1170,30 @@ class DPEngineCoreActor(DPEngineCoreProc): # https://github.com/ray-project/ray/pull/40461/files#diff-31e8159767361e4bc259b6d9883d9c0d5e5db780fcea4a52ead4ee3ee4a59a78R1860 # noqa: E501 # and get_accelerator_ids_for_accelerator_resource() in worker.py # of ray. - self._set_cuda_visible_devices(vllm_config, local_dp_rank) + self._set_visible_devices(vllm_config, local_dp_rank) super().__init__(vllm_config, local_client, "", executor_class, log_stats) - def _set_cuda_visible_devices(self, vllm_config: VllmConfig, - local_dp_rank: int): + def _set_visible_devices(self, vllm_config: VllmConfig, + local_dp_rank: int): from vllm.platforms import current_platform - device_control_env_var = current_platform.device_control_env_var + if current_platform.is_xpu(): + pass + else: + device_control_env_var = current_platform.device_control_env_var + self._set_cuda_visible_devices(vllm_config, local_dp_rank, + device_control_env_var) + + def _set_cuda_visible_devices(self, vllm_config: VllmConfig, + local_dp_rank: int, + device_control_env_var: str): world_size = vllm_config.parallel_config.world_size # Set CUDA_VISIBLE_DEVICES or equivalent. 
try: - os.environ[device_control_env_var] = ",".join( - str(current_platform.device_id_to_physical_device_id(i)) - for i in range(local_dp_rank * - world_size, (local_dp_rank + 1) * world_size)) + value = get_device_indices(device_control_env_var, local_dp_rank, + world_size) + os.environ[device_control_env_var] = value except IndexError as e: raise Exception( f"Error setting {device_control_env_var}: " diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index 62f229e286931..56ef8477d267a 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -164,19 +164,33 @@ def set_device_control_env_var(vllm_config: VllmConfig, """ world_size = vllm_config.parallel_config.world_size evar = current_platform.device_control_env_var + + value = get_device_indices(evar, local_dp_rank, world_size) + with patch.dict(os.environ, values=((evar, value), )): + yield + + +def get_device_indices(device_control_env_var: str, local_dp_rank: int, + world_size: int): + """ + Returns a comma-separated string of device indices for the specified + data parallel rank. + + For example, if world_size=2 and local_dp_rank=1, and there are 4 devices, + this will select devices 2 and 3 for local_dp_rank=1. 
+ """ try: value = ",".join( str(current_platform.device_id_to_physical_device_id(i)) for i in range(local_dp_rank * world_size, (local_dp_rank + 1) * world_size)) except IndexError as e: - raise Exception(f"Error setting {evar}: " + raise Exception(f"Error setting {device_control_env_var}: " f"local range: [{local_dp_rank * world_size}, " f"{(local_dp_rank + 1) * world_size}) " "base value: " - f"\"{os.getenv(evar)}\"") from e - with patch.dict(os.environ, values=((evar, value), )): - yield + f"\"{os.getenv(device_control_env_var)}\"") from e + return value class CoreEngineActorManager: @@ -254,6 +268,19 @@ class CoreEngineActorManager: dp_vllm_config = copy.deepcopy(vllm_config) dp_vllm_config.parallel_config.placement_group = pg local_client = index < local_engine_count + + # Ray XPU known issue: dpctl initializes the GPU runtime early, so + # setting device env vars in Ray actor's initialization method + # will not affect device selection. See: + # https://github.com/ray-project/ray/blob/master/python/ray/_private/accelerators/intel_gpu.py#L56 # noqa: E501 + if current_platform.is_xpu(): + device_evar = current_platform.device_control_env_var + device_indices = get_device_indices(device_evar, local_index, + world_size) + actor_env_vars = self.env_vars_dict.copy() + actor_env_vars[device_evar] = device_indices + runtime_env = RuntimeEnv(env_vars=actor_env_vars) + actor = ray.remote(DPEngineCoreActor).options( scheduling_strategy=PlacementGroupSchedulingStrategy( placement_group=pg, From 1f7a9c95e4b2a1e02b19e94fd7371443f08b2e4b Mon Sep 17 00:00:00 2001 From: Michael Yao <haifeng.yao@daocloud.io> Date: Wed, 27 Aug 2025 20:37:52 +0800 Subject: [PATCH 082/112] [Docs] Fix a 1-2-3 list and style issues in tpu.md (#23729) Signed-off-by: windsonsea <haifeng.yao@daocloud.io> --- docs/configuration/tpu.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md index 
ac2b6baffd14e..e456077e04958 100644 --- a/docs/configuration/tpu.md +++ b/docs/configuration/tpu.md @@ -45,30 +45,30 @@ This initial compilation time ranges significantly and is impacted by many of th ### Optimize based on your data -#### max model len vs. most model len +#### max-model-len vs. most-model-len ![most_model_len](../assets/design/tpu/most_model_len.png) -If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most model len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable. +If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most-model-len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable. For example, 1% requests are 32k length and 99% requests are 2k length. You can pass 32k into `--max-model-len 32768` and use `VLLM_TPU_MOST_MODEL_LEN=2048`. -The requests get subdivided into max-model-len and most-model-len categories, for the latter category, we can gain better performance since the server can process more requests at a time. +The requests get subdivided into max-model-len and most-model-len categories, for the latter category, you can gain better performance since the server can process more requests at a time. #### Padding -For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. Because of the layout of the TPU, try using increments of 128: 128, 256, etc. +For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. 
Because of the layout of the TPU, try using increments of 128 (e.g., 128, 256, etc.) -The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about tpu padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are 2 ways to pad the requests: +The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about TPU padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are 2 ways to pad the requests: -1) the default exponential padding (pad to the nearest power of 2) -2) bucket padding (pad to the nearest linearly increasing bucket). +1. the default exponential padding (pad to the nearest power of 2) +2. bucket padding (pad to the nearest linearly increasing bucket). When using bucket padding, the buckets start from 16, end at max_model_len, and increment by `VLLM_TPU_BUCKET_PADDING_GAP`. For example, max_model_len=512, padding_gap=64, the buckets will be [16, 32, 64, 128, 192, 256, 320, 384, 448, 512]. -The fewer tokens we pad, the less unnecessary computation TPU does, the better performance we can get. For example, if num_tokens=300, with exponential padding, we pad to 512, with the bucket_padding above, we pad to 320. +The fewer tokens you pad, the less unnecessary computation TPU does, the better performance you can get. For example, if num_tokens=300, with exponential padding, you pad to 512, with the bucket_padding above, you pad to 320. However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compiled graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding. 
From 9d30de44698e1e337e4736ff62b83ebe1bbd4d40 Mon Sep 17 00:00:00 2001 From: tc-mb <157115220+tc-mb@users.noreply.github.com> Date: Wed, 27 Aug 2025 20:38:00 +0800 Subject: [PATCH 083/112] [model] Support MiniCPM-V 4.5 (#23586) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: tc-mb <caitianchi@modelbest.cn> Signed-off-by: Xin Yang <xyangx@amazon.com> Signed-off-by: Abatom <abzhonghua@gmail.com> Signed-off-by: chzhang <chaojun.zhang@intel.com> Signed-off-by: Pate Motter <patemotter@google.com> Signed-off-by: Terrencezzj <terrence@cohere.ai> Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> Signed-off-by: simon-mo <simon.mo@hey.com> Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: Siyuan Fu <siyuanf@nvidia.com> Signed-off-by: siyuanf <siyuanf@nvidia.com> Signed-off-by: Weiliang Liu <weiliangl@nvidia.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Signed-off-by: yewentao256 <zhyanwentao@126.com> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Signed-off-by: Zijing Liu <liuzijing2014@gmail.com> Signed-off-by: Zijing Liu <liuzijing2014@users.noreply.github.com> Signed-off-by: jiabin.00 <jiabin.00@bytedance.com> Signed-off-by: zjy0516 <riverclouds.zhu@qq.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> Signed-off-by: tc-mb <157115220+tc-mb@users.noreply.github.com> Signed-off-by: Roger Wang <hey@rogerw.me> Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: Huy Do <huydhn@gmail.com> Signed-off-by: Matúš Námešný <matus.namesny@ameria.com> Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com> Signed-off-by: Chen Zhang <zhangch99@outlook.com> Signed-off-by: oye93 <en.ouyang93@outlook.com> Signed-off-by: Julien Lin <jullin@nvidia.com> Signed-off-by: Didier Durand <durand.didier@gmail.com> Signed-off-by: Tianyu Li <tianyu.li@arm.com> 
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com> Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com> Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com> Signed-off-by: jiang1.li <jiang1.li@intel.com> Signed-off-by: Zerohertz <ohg3417@gmail.com> Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com> Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Signed-off-by: Russell Bryant <rbryant@redhat.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com> Signed-off-by: Federico <65908512+coval3nte@users.noreply.github.com> Signed-off-by: Zixuan Zhang <zixuanzhang@bytedance.com> Signed-off-by: wuhang <wuhang6@huawei.com> Signed-off-by: czhu-cohere <conway.zhu@cohere.com> Signed-off-by: Wei Wei <wwei6@meta.com> Signed-off-by: Yiheng Xu <charlesyihengxu@gmail.com> Signed-off-by: Chenheli Hua <huachenheli@outlook.com> Signed-off-by: wangyafeng <wangyafeng@baidu.com> Co-authored-by: Xin Yang <105740670+xyang16@users.noreply.github.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Zhonghua Deng <abzhonghua@gmail.com> Co-authored-by: Chaojun Zhang <chaojun.zhang@intel.com> Co-authored-by: Pate Motter <p@temotter.com> Co-authored-by: Terrence Zhao <32208165+Terrencezzj@users.noreply.github.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: Simon Mo <simon.mo@hey.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: weiliang <weiliangl@nvidia.com> Co-authored-by: Siyuan Fu <siyuanf@nvidia.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Zijing Liu <liuzijing2014@users.noreply.github.com> 
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com> Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Raghavan <oneraghavan@gmail.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Roger Wang <hey@rogerw.io> Co-authored-by: Roger Wang <hey@rogerw.me> Co-authored-by: knlnguyen1802 <knlnguyen1802@gmail.com> Co-authored-by: Huy Do <huydhn@gmail.com> Co-authored-by: Matúš Námešný <matus@namesny.com> Co-authored-by: Guillaume Calmettes <gcalmettes@scaleway.com> Co-authored-by: Chen Zhang <zhangch99@outlook.com> Co-authored-by: En Ouyang <en.ouyang93@outlook.com> Co-authored-by: Li, Jiang <jiang1.li@intel.com> Co-authored-by: nvjullin <jullin@nvidia.com> Co-authored-by: Didier Durand <2927957+didier-durand@users.noreply.github.com> Co-authored-by: TianyuLi0 <116711075+TianyuLi0@users.noreply.github.com> Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Yuekai Zhang <zhangyuekai@foxmail.com> Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com> Co-authored-by: Hyogeun Oh (오효근) <ohg3417@gmail.com> Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> Co-authored-by: Lukas Geiger <lukas.geiger94@gmail.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Huzaifa Sidhpurwala <huzaifas@redhat.com> Co-authored-by: Federico <65908512+coval3nte@users.noreply.github.com> Co-authored-by: zixuanzhang226 <zixuanzhang@bytedance.com> Co-authored-by: wuhang <wuhang6@huawei.com> Co-authored-by: yzds <41983536+youzhedian@users.noreply.github.com> Co-authored-by: hongchao <hongchao@msh.team> Co-authored-by: czhu-cohere <conway.zhu@cohere.com> Co-authored-by: Wei <weiweinpu@gmail.com> Co-authored-by: Yiheng Xu <charlesyihengxu@gmail.com> Co-authored-by: Aaron Pham <contact@aarnphm.xyz> 
Co-authored-by: Chenheli Hua <huachenheli@outlook.com> Co-authored-by: CSWYF3634076 <58356743+CSWYF3634076@users.noreply.github.com> --- docs/models/supported_models.md | 2 +- tests/models/registry.py | 2 +- vllm/model_executor/models/minicpmv.py | 314 +++++++++++++++++- .../chat_templates/registry.py | 11 + .../chat_templates/template_minicpmv45.jinja | 93 ++++++ 5 files changed, 407 insertions(+), 15 deletions(-) create mode 100644 vllm/transformers_utils/chat_templates/template_minicpmv45.jinja diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 19ce8c06724f4..35a5fa0c2e42f 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -638,7 +638,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ | | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ | | `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, etc. | ✅︎ | | ✅︎ | +| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ | | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ | | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | diff --git a/tests/models/registry.py b/tests/models/registry.py index f2c09d3e8452a..ee546e7af85c6 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -451,7 +451,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", trust_remote_code=True), "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", - extras={"2.6": "openbmb/MiniCPM-V-2_6", "4.0": "openbmb/MiniCPM-V-4"}, # noqa: E501 + extras={"2.6": "openbmb/MiniCPM-V-2_6", "4.0": "openbmb/MiniCPM-V-4", "4.5": "openbmb/MiniCPM-V-4_5"}, # noqa: E501 trust_remote_code=True), "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501 trust_remote_code=True, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index c22d871ab20d9..2d785c30fd7df 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -27,12 +27,14 @@ import math from collections import defaultdict from collections.abc import Iterable, Mapping, Sequence from functools import partial +from itertools import chain from typing import Annotated, Any, Callable, Literal, Optional, Union import numpy as np import torch import torch.types from torch import nn +from torch.nn.init import trunc_normal_ from transformers import BatchFeature, PretrainedConfig from typing_extensions import TypeVar @@ -47,10 +49,11 @@ from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.models.minicpm import MiniCPMForCausalLM from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM +from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal 
import MULTIMODAL_REGISTRY, MultiModalKwargsItems +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - NestedTensors) + MultiModalKwargsItems, NestedTensors) from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem, ImageProcessorItems, ImageSize, ModalityData, ModalityDataItems, @@ -218,6 +221,187 @@ class Resampler2_5(BaseResampler): return x +class Resampler4_5(Resampler2_5): + + def __init__(self, + num_queries: int, + embed_dim: int, + num_heads: int, + kv_dim: Optional[int] = None, + norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, + max_size: tuple[int, int] = (70, 70), + max_temporal_size: int = 36000, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: + super().__init__(num_queries, + embed_dim, + num_heads, + kv_dim, + norm_layer, + max_size, + quant_config=quant_config, + prefix=prefix) + + trunc_normal_(self.query, std=.02) + self.max_temporal_size = max_temporal_size + self._set_temporal_pos_cache(self.max_temporal_size) + self.apply(self._init_weights) + + def get_1d_sincos_pos_embed_from_temporal_size(self, embed_dim: int, + pos: np.ndarray): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. 
/ 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + def _set_temporal_pos_cache(self, + max_temporal_size: int, + device: torch.types.Device = "cpu") -> None: + temporal_size = np.arange(max_temporal_size, dtype=np.float32) + pos_embed = torch.from_numpy( + self.get_1d_sincos_pos_embed_from_temporal_size( + self.embed_dim, temporal_size)).float().to(device) + self.register_buffer("temporal_pos_embed", pos_embed, persistent=False) + + def _adjust_temporal_pos_cache(self, + max_temporal_size: int, + device: torch.types.Device = "cpu"): + if max_temporal_size > self.max_temporal_size: + self.max_temporal_size = max_temporal_size + self._set_temporal_pos_cache(self.max_temporal_size, device) + + def _init_weights(self, m: Union[nn.Linear, nn.LayerNorm]): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward( + self, + x: torch.Tensor, + tgt_sizes: torch.Tensor, + # temporal_ids for high refresh rate videos + temporal_ids=None + ) -> torch.Tensor: + assert x.shape[0] == tgt_sizes.shape[0] + bs = x.shape[0] + + device = x.device + dtype = x.dtype + + patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] + + self._adjust_pos_cache(tgt_sizes, device=device) + + temporal_pos_emb = False + temporal_ids_flatten = None + if temporal_ids is not None: + # example: [[-1], [-1], [2, 6, 9]] + temporal_ids_flatten = list(chain.from_iterable(temporal_ids)) + max_temporal_size = max(temporal_ids_flatten, default=0) + if max_temporal_size > -1: + temporal_pos_emb = True + if max_temporal_size > self.max_temporal_size: + 
self._adjust_temporal_pos_cache(max_temporal_size, device) + + max_patch_len = patch_len.max().item() + assert isinstance(max_patch_len, int) + + key_padding_mask = torch.zeros((bs, max_patch_len), + dtype=torch.bool, + device=device) + + x, _ = self.kv_proj(x) # B * L * D + x = self.ln_kv(x).permute(1, 0, 2) # L * B * D + q = self.ln_q(self.query) # Q * D + + pos_embed_2d = [] + pos_embed_temporal = [] + for i in range(bs): + tgt_h, tgt_w = tgt_sizes[i] + if temporal_pos_emb: + if temporal_ids_flatten[i] == -1: + pos_embed_temporal.append( + torch.zeros(self.embed_dim, dtype=dtype, + device=device)) + else: + pos_embed_temporal.append(self.temporal_pos_embed[ + temporal_ids_flatten[i]].to(dtype)) # D + + pos_embed_2d.append(self.pos_embed[:tgt_h, :tgt_w, :].reshape( + (tgt_h * tgt_w, -1)).to(dtype)) # patches * D + key_padding_mask[i, patch_len[i]:] = True + + pos_embed_2d = torch.nn.utils.rnn.pad_sequence( + pos_embed_2d, batch_first=True, + padding_value=0.0).permute(1, 0, 2) # BLD => L * B * D + + k = x + v = x + pos_embed_2d + if pos_embed_temporal: + k += torch.stack(pos_embed_temporal, dim=0) + bs = len(temporal_ids) + merge_k = [] + merge_v = [] + merge_key_padding_mask = [] + + start = 0 + for tp in temporal_ids: + end = start + len(tp) + # L * (end-start) * D -> (end-start) * L * D + # -> 1 * L*(end-start) * D + merge_k.append(k[:, start:end, :].permute(1, 0, 2).reshape( + -1, self.embed_dim)) + merge_v.append(v[:, start:end, :].permute(1, 0, 2).reshape( + -1, self.embed_dim)) + merge_key_padding_mask.append( + key_padding_mask[start:end, :].reshape(-1, 1)) + + start = end + + k = torch.nn.utils.rnn.pad_sequence(merge_k, + batch_first=True, + padding_value=0.0).permute( + 1, 0, 2) # L*(end-start) + v = torch.nn.utils.rnn.pad_sequence(merge_v, + batch_first=True, + padding_value=0.0).permute( + 1, 0, 2) # L*(end-start) + key_padding_mask = torch.nn.utils.rnn.pad_sequence( + merge_key_padding_mask, batch_first=True, + padding_value=True).squeeze(-1) + + out 
= self.attn( + self._repeat(q, bs), # Q * B * D + k, # L * B * D + L * B * D + v, + key_padding_mask=key_padding_mask, + )[0] + # out: Q * B * D + x = out.permute(1, 0, 2) # B * Q * D + + x = self.ln_post(x) + x = x @ self.proj + return x + + def get_version_by_config(config: PretrainedConfig) -> tuple[int, ...]: version_float = getattr(config, "version", None) @@ -354,9 +538,7 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: mm_limits = {"image": None} - if self.get_model_version() == (2, - 6) or self.get_model_version() == (4, - 0): + if self.get_model_version() in {(2, 6), (4, 0), (4, 5)}: mm_limits["video"] = None return mm_limits @@ -637,8 +819,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): out_keys: set[str], ) -> dict[str, NestedTensors]: # This processor supports zipping prompt and mm_data together - if self.info.get_model_version() == ( - 2, 6) or self.info.get_model_version() == (4, 0): + if self.info.get_model_version() in {(2, 6), (4, 0), (4, 5)}: inputs = super()._call_hf_processor( prompt=prompts, # type: ignore mm_data=mm_data, @@ -816,7 +997,6 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): # and config class self.config = config self.multimodal_config = multimodal_config - self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.version = get_version_by_config(self.config) self.llm = self.init_llm(vllm_config=vllm_config, @@ -1364,11 +1544,9 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA): prefix: str = "", ) -> nn.Module: quant_config = self._maybe_ignore_quant_config(quant_config) - model = Idefics2VisionTransformer( - config.vision_config, - quant_config=quant_config, - prefix=prefix, - use_data_parallel=self.use_data_parallel) + model = Idefics2VisionTransformer(config.vision_config, + quant_config=quant_config, + prefix=prefix) if self.config.drop_vision_last_layer: model.encoder.layers = 
model.encoder.layers[:-1] return model @@ -1436,11 +1614,121 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA): return loader.load_weights(weights) +class MiniCPMV4_5(MiniCPMVBaseModel, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + assert self.version == (4, 5) + + def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + if isinstance(quant_config, (AWQConfig, AWQMarlinConfig)): + return None + return quant_config + + def init_llm( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> nn.Module: + return Qwen3ForCausalLM(vllm_config=vllm_config, prefix=prefix) + + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> nn.Module: + quant_config = self._maybe_ignore_quant_config(quant_config) + model = Idefics2VisionTransformer(config.vision_config, + quant_config=quant_config, + prefix=prefix) + if self.config.drop_vision_last_layer: + model.encoder.layers = model.encoder.layers[:-1] + return model + + def init_resampler( + self, + embed_dim: int, + vision_dim: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> nn.Module: + quant_config = self._maybe_ignore_quant_config(quant_config) + with set_default_torch_dtype(torch.float16): + # 4.5 uses Resampler4_5, which extends the 2.5/2.6 resampler with temporal position embeddings.
+ resampler = Resampler4_5(num_queries=self.config.query_num, + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_dim, + quant_config=quant_config, + prefix=prefix) + + return resampler.to(device=current_platform.device_type, + dtype=torch.get_default_dtype()) + + def get_vision_hidden_states( + self, data: MiniCPMVImagePixelInputs) -> torch.Tensor: + pixel_values = data["pixel_values"] + tgt_sizes = data["tgt_sizes"] + temporal_ids = data.get('temporal_ids', None) + + B = len(pixel_values) + P = pixel_values[0].shape[-2] + L = max(item.shape[-1] for item in pixel_values) + device = pixel_values[0].device + dtype = pixel_values[0].dtype + + all_pixel_values = torch.zeros((B, 3, P, L), + dtype=dtype, + device=device) + all_temporal_ids = None if temporal_ids is None else flatten_2d_lists( + temporal_ids) + for i, pixel_values_item in enumerate(pixel_values): + L_item = pixel_values_item.shape[-1] + all_pixel_values[i, ..., :L_item] = pixel_values_item + + num_patches = tgt_sizes.prod(-1) + max_patches = num_patches.max().item() + assert isinstance(max_patches, int) + + patch_attn_mask = torch.zeros((B, max_patches), + dtype=torch.bool, + device=device) + for i, num_patches_item in enumerate(num_patches): + patch_attn_mask[i, :num_patches_item] = True + + vision_embedding = self.vpm( + all_pixel_values, + patch_attention_mask=patch_attn_mask.unsqueeze(1), + tgt_sizes=tgt_sizes, + ) + + return self.resampler(vision_embedding, tgt_sizes, all_temporal_ids) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self, + skip_prefixes=["apm.", "audio", "tts"]) + return loader.load_weights(weights) + + _SUPPORT_VERSION = { (2, 0): MiniCPMV2_0, (2, 5): MiniCPMV2_5, (2, 6): MiniCPMV2_6, (4, 0): MiniCPMV4_0, + (4, 5): MiniCPMV4_5, } diff --git a/vllm/transformers_utils/chat_templates/registry.py b/vllm/transformers_utils/chat_templates/registry.py index e0ef7f0999d47..d09c5fa924fb0 100644 --- 
a/vllm/transformers_utils/chat_templates/registry.py +++ b/vllm/transformers_utils/chat_templates/registry.py @@ -20,6 +20,16 @@ def _get_qwen_chat_template_fallback( return CHAT_TEMPLATES_DIR / "template_basic.jinja" +def _get_minicpmv_chat_template_fallback( + tokenizer_name_or_path: str) -> Optional[Path]: + # MiniCPM-V-4.5 version uses a dedicated template + if "4.5" in tokenizer_name_or_path or "4_5" in tokenizer_name_or_path: + return CHAT_TEMPLATES_DIR / "template_minicpmv45.jinja" + + # Other versions use chatml template + return CHAT_TEMPLATES_DIR / "template_chatml.jinja" + + # yapf: disable _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = { "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja", @@ -27,6 +37,7 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = { "deepseek_vl_v2": CHAT_TEMPLATES_DIR / "template_deepseek_vl2.jinja", "florence2": CHAT_TEMPLATES_DIR / "template_basic.jinja", "fuyu": CHAT_TEMPLATES_DIR / "template_fuyu.jinja", + "minicpmv": _get_minicpmv_chat_template_fallback, "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja", "qwen": _get_qwen_chat_template_fallback, } diff --git a/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja b/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja new file mode 100644 index 0000000000000..661ebd1cf5c17 --- /dev/null +++ b/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja @@ -0,0 +1,93 @@ +{%- set enable_thinking = enable_thinking | default(false) %} +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n</tools>\n\nFor each function call, return a json object with 
function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} + +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} + +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '</think>' in message.content %} + {%- set content = message.content.split('</think>')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + + {%- if message.tool_calls %} + {%- for tool_call 
in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '<tool_call>\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n</tool_call>' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n<tool_response>\n' }} + {{- message.content }} + {{- '\n</tool_response>' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} + +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '<think>\n\n</think>\n\n' }} + {%- endif %} + {%- if enable_thinking is defined and enable_thinking is true %} + {{- '<think>\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file From 8c13820f0b203976eab8e821c102234a73f338cd Mon Sep 17 00:00:00 2001 From: cndoit18 <cndoit18@outlook.com> Date: Wed, 27 Aug 2025 20:42:20 +0800 Subject: [PATCH 084/112] [Bugfix] Fix task field initialization when PYTHONOPTIMIZE is enabled (#23718) Signed-off-by: cndoit18 <cndoit18@outlook.com> --- vllm/worker/pooling_model_runner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index 8d8d9b4d0503f..3e1950798dbf6 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -199,8 +199,9 @@ class PoolingModelRunner( pooling_params = seq_group_metadata.pooling_params assert pooling_params is not None - assert (task := pooling_params.task) is not 
None, ( - "You did not set `task` in the API") + + task = pooling_params.task + assert task is not None, "You did not set `task` in the API" model = cast(VllmModelForPooling, self.model) to_update = model.pooler.get_pooling_updates(task) From a403d0fa41cc68e3b6da4e1097dc896fde2f1a6a Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Wed, 27 Aug 2025 05:50:47 -0700 Subject: [PATCH 085/112] [Misc] Remove unnecessary `_send_reconfig_message()` in `core_client.py` (#23127) Signed-off-by: Nick Hill <nhill@redhat.com> --- vllm/v1/engine/core_client.py | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 079dd9a7d38d1..65f7abc97110c 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1190,21 +1190,6 @@ class DPLBAsyncMPClient(DPAsyncMPClient): await self._send_input(EngineCoreRequestType.ABORT, request_ids, engine) - async def _send_reconfig_message( - self, reconfig_request: ReconfigureDistributedRequest, - engine: EngineIdentity) -> asyncio.Future: - """Send reconfiguration message and return the result future without - waiting for completion.""" - call_id = uuid.uuid1().int >> 64 - future = asyncio.get_running_loop().create_future() - self.utility_results[call_id] = future - message = (EngineCoreRequestType.UTILITY.value, *self.encoder.encode( - (self.client_index, call_id, "reinitialize_distributed", - (reconfig_request, )))) - await self._send_input_message(message, engine, reconfig_request) - self._ensure_output_queue_task() - return future - async def scale_elastic_ep(self, new_data_parallel_size: int) -> None: """Scale elastic EP data parallel size""" cur_data_parallel_size = len(self.core_engines) @@ -1214,7 +1199,7 @@ class DPLBAsyncMPClient(DPAsyncMPClient): f"different from cur_data_parallel_size {cur_data_parallel_size}") assert self.vllm_config.parallel_config.data_parallel_backend == \ - "ray", 
("Only ray DP backend supports scaling elastic EP") + "ray", "Only ray DP backend supports scaling elastic EP" scale_up = new_data_parallel_size > cur_data_parallel_size @@ -1246,9 +1231,10 @@ class DPLBAsyncMPClient(DPAsyncMPClient): data_parallel_master_ip, new_data_parallel_master_port=self.vllm_config.parallel_config. data_parallel_master_port) - future = await self._send_reconfig_message(reconfig_request, - engine) - reconfig_futures.append(future) + coro = self._call_utility_async("reinitialize_distributed", + reconfig_request, + engine=engine) + reconfig_futures.append(asyncio.create_task(coro)) logger.info("All reconfigure messages sent, starting engine creation") @@ -1318,9 +1304,10 @@ class DPLBAsyncMPClient(DPAsyncMPClient): if cur_dp_rank >= new_data_parallel_size: reconfig_request.new_data_parallel_rank = \ ReconfigureRankType.SHUTDOWN_CURRENT_RANK - future = await self._send_reconfig_message(reconfig_request, - engine) - reconfig_futures.append(future) + coro = self._call_utility_async("reinitialize_distributed", + reconfig_request, + engine=engine) + reconfig_futures.append(asyncio.create_task(coro)) for _ in range(new_data_parallel_size, cur_data_parallel_size): self.core_engines.pop() From 704432af3c129b7a57fca9b059eefe214159f836 Mon Sep 17 00:00:00 2001 From: Thomas Parnell <tpa@zurich.ibm.com> Date: Wed, 27 Aug 2025 14:51:54 +0200 Subject: [PATCH 086/112] [V1] [Hybrid] Disable prefix caching by default for hybrid or mamba-based models (#23716) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> --- docs/usage/v1_guide.md | 10 ++++++---- vllm/model_executor/models/config.py | 9 +++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 64bd0d9bf5071..20234e7611333 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -107,14 +107,16 @@ to enable simultaneous generation and embedding using the same engine instance i #### Mamba Models Models using selective state-space 
mechanisms instead of standard transformer attention are supported. -Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1. +Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. +Please note that prefix caching is not yet supported for these models. Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, -`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that -these models currently require disabling prefix caching in V1. +`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). +Please note that prefix caching is not yet supported for these models. Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`). -Please note that these models currently require disabling prefix caching and enforcing eager mode in V1. +Please note that prefix caching is not yet supported for these models. +It is also necessary to enforce eager mode for these models in V1. 
#### Encoder-Decoder Models diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index f62209326b988..88b3154de2cbb 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -292,12 +292,13 @@ class MambaModelConfig(VerifyAndUpdateConfig): return model_config = vllm_config.model_config + cache_config = vllm_config.cache_config compilation_config = vllm_config.compilation_config - model_cls, _ = ModelRegistry.resolve_model_cls( - model_config.architecture, - model_config=model_config, - ) + # TODO(tdoublep): remove once prefix caching is enabled + cache_config.enable_prefix_caching = False + logger.info("Hybrid or mamba-based model detected: disabling prefix " + "caching since it is not yet supported.") # TODO(tdoublep): remove as full cuda graph support is added FCG_NOT_SUPPORTED_MODELS = [ From 5eeef1b90852917b300ed67b98e341eb846ba2e9 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Wed, 27 Aug 2025 21:24:09 +0800 Subject: [PATCH 087/112] [Model] Explicit `default_pooling_type` interface (#23736) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/model_executor/models/bert.py | 4 +-- vllm/model_executor/models/bert_with_rope.py | 5 ++-- vllm/model_executor/models/gritlm.py | 2 +- vllm/model_executor/models/interfaces.py | 19 +------------ vllm/model_executor/models/interfaces_base.py | 28 +++++++++++++++++++ vllm/model_executor/models/internlm2.py | 3 +- vllm/model_executor/models/modernbert.py | 3 +- .../models/prithvi_geospatial_mae.py | 7 +++-- vllm/model_executor/models/qwen2_rm.py | 3 +- vllm/model_executor/models/registry.py | 7 +++-- vllm/model_executor/models/roberta.py | 3 +- 11 files changed, 51 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 22b6c4401213c..b34ca5cbe963d 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -28,8 
+28,8 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import (SupportsCrossEncoding, SupportsQuant, - default_pooling_type) +from .interfaces import SupportsCrossEncoding, SupportsQuant +from .interfaces_base import default_pooling_type from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 129450927e564..dcb7e75456cde 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -27,13 +27,14 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import (SupportsQuant, - default_pooling_type) from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors +from .interfaces import SupportsQuant +from .interfaces_base import default_pooling_type + class BertWithRopeEmbedding(nn.Module): diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 3f6790269ae62..1b3d541c65cf8 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -20,7 +20,7 @@ from vllm.sequence import PoolerOutput from vllm.tasks import PoolingTask from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config -from .interfaces import default_pooling_type +from .interfaces_base import default_pooling_type logger = init_logger(__name__) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 9415e67924e74..22f005849e864 
100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -3,7 +3,7 @@ from collections.abc import Iterable, Mapping, MutableSequence from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, - TypeVar, Union, overload, runtime_checkable) + Union, overload, runtime_checkable) import numpy as np import torch @@ -641,23 +641,6 @@ def supports_cross_encoding( return is_pooling_model(model) and _supports_cross_encoding(model) -_T = TypeVar("_T", bound=type[torch.nn.Module]) - - -def default_pooling_type(pooling_type: str): - """Set default_pooling_type decorator. """ - - def func(model: _T) -> _T: - model.default_pooling_type = pooling_type # type: ignore - return model - - return func - - -def get_default_pooling_type(model: Union[type[object], object]) -> str: - return getattr(model, "default_pooling_type", "LAST") - - class SupportsQuant: """The interface required for all models that support quantization.""" diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 697fa020deb46..19a3ef1a3b800 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -144,6 +144,17 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): MRO of your model class. """ + default_pooling_type: ClassVar[str] = "LAST" + """ + Indicates the + [vllm.model_executor.layers.pooler.PoolerConfig.pooling_type][] + to use by default. + + You can use the + [vllm.model_executor.models.interfaces_base.default_pooling_type][] + decorator to conveniently set this field. 
+ """ + pooler: Pooler """The pooler is only called on TP rank 0.""" @@ -165,3 +176,20 @@ def is_pooling_model( return False return getattr(model, "is_pooling_model", False) + + +_T = TypeVar("_T", bound=type[nn.Module]) + + +def default_pooling_type(pooling_type: str): + """Decorator to set `VllmModelForPooling.default_pooling_type`.""" + + def func(model: _T) -> _T: + model.default_pooling_type = pooling_type # type: ignore + return model + + return func + + +def get_default_pooling_type(model: Union[type[object], object]) -> str: + return getattr(model, "default_pooling_type", "LAST") diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index d0c4bf5450d6d..26bc48ffbd9bc 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -31,7 +31,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP, default_pooling_type +from .interfaces import SupportsLoRA, SupportsPP +from .interfaces_base import default_pooling_type from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 72290bf2ee29f..4778555861286 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -26,7 +26,8 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import SupportsCrossEncoding, default_pooling_type +from .interfaces import SupportsCrossEncoding +from .interfaces_base import default_pooling_type from .utils import WeightsMapper, maybe_prefix diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py 
b/vllm/model_executor/models/prithvi_geospatial_mae.py index 59e9f3e8a47b0..f46d6375e1f61 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -27,9 +27,6 @@ from transformers import BatchFeature from vllm.config import VllmConfig from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import ( - IsAttentionFree, MultiModalEmbeddings, SupportsMultiModalWithRawInput, - default_pooling_type) from vllm.model_executor.models.utils import AutoWeightsLoader from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (ImageItem, ModalityData, @@ -43,6 +40,10 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from .interfaces import (IsAttentionFree, MultiModalEmbeddings, + SupportsMultiModalWithRawInput) +from .interfaces_base import default_pooling_type + def _prithvi_field_config(hf_inputs: Mapping[str, torch.Tensor]): # This model receives in input a multi-dimensional tensor representing diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index e0a30e04c602a..421b43563bade 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -18,7 +18,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP, default_pooling_type +from .interfaces import SupportsLoRA, SupportsPP +from .interfaces_base import default_pooling_type from .qwen2 import Qwen2Model from .utils import AutoWeightsLoader, maybe_prefix diff --git a/vllm/model_executor/models/registry.py 
b/vllm/model_executor/models/registry.py index c65c58d4a047f..196b5f35e1e4f 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -25,11 +25,12 @@ from vllm.logger import init_logger from vllm.transformers_utils.dynamic_module import ( try_get_class_from_dynamic_module) -from .interfaces import (get_default_pooling_type, has_inner_state, has_noops, - is_attention_free, is_hybrid, supports_cross_encoding, +from .interfaces import (has_inner_state, has_noops, is_attention_free, + is_hybrid, supports_cross_encoding, supports_multimodal, supports_multimodal_raw_input, supports_pp, supports_transcription, supports_v0_only) -from .interfaces_base import is_pooling_model, is_text_generation_model +from .interfaces_base import (get_default_pooling_type, is_pooling_model, + is_text_generation_model) logger = init_logger(__name__) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 49a37342c67fa..2bfa51162910b 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -22,7 +22,8 @@ from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, from vllm.sequence import IntermediateTensors from .bert_with_rope import BertWithRope, JinaRobertaModel -from .interfaces import SupportsCrossEncoding, default_pooling_type +from .interfaces import SupportsCrossEncoding +from .interfaces_base import default_pooling_type class RobertaEmbedding(nn.Module): From 8dd2baa5978f123974177023d6efab731153a2f4 Mon Sep 17 00:00:00 2001 From: rebel-hongseok <hongseok@rebellions.ai> Date: Wed, 27 Aug 2025 22:25:49 +0900 Subject: [PATCH 088/112] Add vLLM Korea Meetup in the README.md and meetups.md (#23746) Signed-off-by: rebel-hongseok <hongseok@rebellions.ai> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- README.md | 1 + docs/community/meetups.md | 1 + 2 files changed, 2 insertions(+) diff --git 
a/README.md b/README.md index ef5b43588953c..8812aac4ea266 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Easy, fast, and cheap LLM serving for everyone *Latest News* 🔥 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH). +- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). - [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152). - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/). - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). diff --git a/docs/community/meetups.md b/docs/community/meetups.md index 61ea44220ad2e..d76238cb31791 100644 --- a/docs/community/meetups.md +++ b/docs/community/meetups.md @@ -3,6 +3,7 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. 
Please find the materials of our previous meetups below: - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH) +- [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152). - [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing) - [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). From 16dc4052b004261b547fc50fe7b20e2d2fbf915d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 27 Aug 2025 14:39:48 +0100 Subject: [PATCH 089/112] Fix pre-commit on main (#23747) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/community/meetups.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/community/meetups.md b/docs/community/meetups.md index d76238cb31791..221a7bd96213f 100644 --- a/docs/community/meetups.md +++ b/docs/community/meetups.md @@ -3,7 +3,7 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. 
Please find the materials of our previous meetups below: - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH) -- [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). +- [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152). - [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing) - [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). 
From fe8d7b6f03e7d8a36ffb6931397fc81ee594dd64 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Wed, 27 Aug 2025 21:41:22 +0800 Subject: [PATCH 090/112] [Model] Interface to enable batch-level DP support (#23733) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/configuration/optimization.md | 7 +++++-- vllm/config/__init__.py | 7 +++++++ vllm/model_executor/models/interfaces.py | 11 +++++++++++ vllm/model_executor/models/minicpmv.py | 2 ++ vllm/model_executor/models/mllama4.py | 2 ++ vllm/model_executor/models/qwen2_5_vl.py | 2 ++ vllm/model_executor/models/registry.py | 9 +++++++-- vllm/model_executor/models/step3_vl.py | 2 ++ 8 files changed, 38 insertions(+), 4 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index a8eab9985c8b9..b11ccb5c00273 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -168,8 +168,11 @@ llm = LLM( Batch-level DP is not to be confused with API request-level DP (which is instead controlled by `data_parallel_size`). -The availability of batch-level DP is based on model implementation. -Currently, the following models support `mm_encoder_tp_mode="data"`: +Batch-level DP needs to be implemented on a per-model basis, +and enabled by setting `supports_encoder_tp_data = True` in the model class. +Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to use this feature. 
+ +Known supported models: - Llama4 (<gh-pr:18368>) - MiniCPM-V-4 (<gh-pr:23327>) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index ac6f51df95498..e3fb6d796def5 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -872,6 +872,13 @@ class ModelConfig: def _init_multimodal_config(self) -> Optional["MultiModalConfig"]: if self._model_info.supports_multimodal: + if (self.mm_encoder_tp_mode == "data" and + not self._model_info.supports_multimodal_encoder_tp_data): + logger.warning_once( + "This model does not support `--mm-encoder-tp-mode data`. " + "Falling back to `--mm-encoder-tp-mode weights`.") + self.mm_encoder_tp_mode = "weights" + return MultiModalConfig( limit_per_prompt=self.limit_mm_per_prompt, media_io_kwargs=self.media_io_kwargs, diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 22f005849e864..506732fed3614 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -52,6 +52,12 @@ class SupportsMultiModal(Protocol): MRO of your model class. """ + supports_encoder_tp_data: ClassVar[bool] = False + """ + A flag that indicates whether this model supports + `multimodal_config.mm_encoder_tp_mode="data"`. 
+ """ + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: """ @@ -137,6 +143,11 @@ def supports_multimodal( return getattr(model, "supports_multimodal", False) +def supports_multimodal_encoder_tp_data( + model: Union[type[object], object]) -> bool: + return getattr(model, "supports_encoder_tp_data", False) + + @runtime_checkable class SupportsMultiModalWithRawInput(SupportsMultiModal, Protocol): """The interface required for all multi-modal models.""" diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 2d785c30fd7df..0181bfeebda08 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1521,6 +1521,8 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA): ], } + supports_encoder_tp_data = True + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) assert self.version == (4, 0) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 595bdd17cf2c2..ac9b968f7a0cd 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -716,6 +716,8 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, "gate_up_proj": ["gate_proj", "up_proj"], } + supports_encoder_tp_data = True + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if modality.startswith("image"): diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 648ba81eb3877..b528083b7c9cc 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -868,6 +868,8 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, "model.": "language_model.model.", }) + supports_encoder_tp_data = True + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if 
modality.startswith("image"): diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 196b5f35e1e4f..80eac78cdfadb 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -27,8 +27,10 @@ from vllm.transformers_utils.dynamic_module import ( from .interfaces import (has_inner_state, has_noops, is_attention_free, is_hybrid, supports_cross_encoding, - supports_multimodal, supports_multimodal_raw_input, - supports_pp, supports_transcription, supports_v0_only) + supports_multimodal, + supports_multimodal_encoder_tp_data, + supports_multimodal_raw_input, supports_pp, + supports_transcription, supports_v0_only) from .interfaces_base import (get_default_pooling_type, is_pooling_model, is_text_generation_model) @@ -324,6 +326,7 @@ class _ModelInfo: supports_cross_encoding: bool supports_multimodal: bool supports_multimodal_raw_input: bool + supports_multimodal_encoder_tp_data: bool supports_pp: bool has_inner_state: bool is_attention_free: bool @@ -343,6 +346,8 @@ class _ModelInfo: supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), supports_multimodal_raw_input=supports_multimodal_raw_input(model), + supports_multimodal_encoder_tp_data= + supports_multimodal_encoder_tp_data(model), supports_pp=supports_pp(model), has_inner_state=has_inner_state(model), is_attention_free=is_attention_free(model), diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index f8877b584b198..f379d2c15fb6c 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -867,6 +867,8 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, "lm_head.": "language_model.lm_head.", }) + supports_encoder_tp_data = True + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if modality.startswith("image"): From 
513c1fe255f7d4ec3e91f7f5c2dd2d97c0460765 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 27 Aug 2025 14:55:12 +0100 Subject: [PATCH 091/112] Only run `get_attr_docs` if generating help text (#23723) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9e7c95ea5205f..3399d505e3631 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -152,9 +152,17 @@ def is_online_quantization(quantization: Any) -> bool: return quantization in ["inc"] +NEEDS_HELP = ( + "--help" in (argv := sys.argv) # vllm SUBCOMMAND --help + or (argv0 := argv[0]).endswith("mkdocs") # mkdocs SUBCOMMAND + or argv0.endswith("mkdocs/__main__.py") # python -m mkdocs SUBCOMMAND +) + + @functools.lru_cache(maxsize=30) def _compute_kwargs(cls: ConfigType) -> dict[str, Any]: - cls_docs = get_attr_docs(cls) + # Save time only getting attr docs if we're generating help text + cls_docs = get_attr_docs(cls) if NEEDS_HELP else {} kwargs = {} for field in fields(cls): # Get the set of possible types for the field @@ -172,7 +180,7 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]: # Get the help text for the field name = field.name - help = cls_docs[name].strip() + help = cls_docs.get(name, "").strip() # Escape % for argparse help = help.replace("%", "%%") @@ -254,6 +262,9 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]: def get_kwargs(cls: ConfigType) -> dict[str, Any]: """Return argparse kwargs for the given Config dataclass. + If `--help` or `mkdocs` are not present in the command line command, the + attribute documentation will not be included in the help output. + The heavy computation is cached via functools.lru_cache, and a deep copy is returned so callers can mutate the dictionary without affecting the cached version. 
From 3af47c3cc693f432b59658019891393385aa0e2a Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 27 Aug 2025 10:09:08 -0400 Subject: [PATCH 092/112] [Feature] Add Hopper DeepGEMM E8M0 for DeepSeekV3.1 scale_fmt (#23666) Signed-off-by: yewentao256 <zhyanwentao@126.com> Signed-off-by: youkaichao <youkaichao@gmail.com> Co-authored-by: youkaichao <youkaichao@gmail.com> --- tests/kernels/moe/test_block_fp8.py | 5 +- tests/kernels/moe/test_deepep_deepgemm_moe.py | 7 ++- vllm/envs.py | 8 ++- .../layers/fused_moe/batched_deep_gemm_moe.py | 4 +- .../layers/fused_moe/fused_moe.py | 7 ++- .../layers/fused_moe/triton_deep_gemm_moe.py | 6 +-- .../model_executor/layers/quantization/fp8.py | 9 ++-- .../layers/quantization/utils/fp8_utils.py | 4 +- vllm/transformers_utils/config.py | 18 +++++++ vllm/utils/deep_gemm.py | 53 +++++++++---------- 10 files changed, 68 insertions(+), 53 deletions(-) diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index 9e4eaf221f245..ecc57acc67963 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -16,7 +16,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, modular_triton_fused_moe) from vllm.platforms import current_platform from vllm.utils import has_deep_gemm -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used dg_available = has_deep_gemm() @@ -226,8 +226,7 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed, @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.") -@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), - reason="Not E8M0 scale MOE") +@pytest.mark.skipif(is_deep_gemm_e8m0_used(), reason="Not E8M0 scale MOE") @torch.inference_mode() def test_w8a8_block_fp8_deep_gemm_fused_moe(M, 
N, K, E, topk, seed, monkeypatch): diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 1e922be47f2b4..36a98522a6588 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -20,8 +20,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEModularKernel) from vllm.platforms import current_platform from vllm.utils import has_deep_ep, has_deep_gemm -from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, - is_deep_gemm_supported) +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch @@ -374,7 +373,7 @@ NUM_EXPERTS = [32] @multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), +@pytest.mark.skipif(is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM") def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int, topk: int, world_dp_size: tuple[int, int]): @@ -432,7 +431,7 @@ USE_FP8_DISPATCH = [False] @multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), +@pytest.mark.skipif(is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM") def test_ll_deepep_deepgemm_moe( mnk: tuple[int, int, int], diff --git a/vllm/envs.py b/vllm/envs.py index 66c7c2c7f2c4d..35735b552575b 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -131,6 +131,7 @@ if TYPE_CHECKING: VLLM_TPU_USING_PATHWAYS: bool = False VLLM_USE_DEEP_GEMM: bool = False VLLM_USE_DEEP_GEMM_E8M0: bool = True + VLLM_USE_DEEP_GEMM_E8M0_HOPPER: bool = False VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True VLLM_USE_FLASHINFER_MOE_FP8: bool = False @@ -954,9 +955,12 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: 
bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), # Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs. - # E8M0 is faster on B200 but may reduce accuracy. "VLLM_USE_DEEP_GEMM_E8M0": lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))), + # TODO(wentao): unify the two E8M0 flags after verifying the correctness. + # Whether to use E8M0 scaling when DeepGEMM is used on Hopper GPUs. + "VLLM_USE_DEEP_GEMM_E8M0_HOPPER": + lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "0"))), # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm # JIT all the required kernels before model execution so there is no # JIT'ing in the hot-path. However, this warmup increases the engine @@ -1244,6 +1248,8 @@ def compute_hash() -> str: "VLLM_USE_FLASHINFER_SAMPLER", "VLLM_DISABLED_KERNELS", "VLLM_USE_DEEP_GEMM", + "VLLM_USE_DEEP_GEMM_E8M0", + "VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "VLLM_USE_TRTLLM_FP4_GEMM", "VLLM_USE_FUSED_MOE_GROUPED_TOPK", "VLLM_USE_FLASHINFER_MOE_FP8", diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index c4d680af932f0..a5326dfe84f6d 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -12,7 +12,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.triton_utils import tl, triton from vllm.utils.deep_gemm import (fp8_m_grouped_gemm_nt_masked, - is_blackwell_deep_gemm_e8m0_used) + is_deep_gemm_e8m0_used) logger = init_logger(__name__) @@ -174,7 +174,7 @@ def silu_mul_fp8_quant_deep_gemm( eps, fp8_min, fp8_max, - is_blackwell_deep_gemm_e8m0_used(), + is_deep_gemm_e8m0_used(), BLOCK=group_size, NUM_STAGES=4, num_warps=1, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 
84dafcf00d821..17a5c735a57fe 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -40,7 +40,7 @@ from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled @@ -1431,9 +1431,8 @@ def fused_experts(hidden_states: torch.Tensor, # E8M0 scale, which means we requantize the weight and input to the specific # scale. Fallen back to cutlass or triton for some cases would cause # accuracy issue. - if (allow_deep_gemm and use_fp8_w8a8 - and (is_blackwell_deep_gemm_e8m0_used() - or _valid_deep_gemm(hidden_states, w1, w2))): + if (allow_deep_gemm and use_fp8_w8a8 and + (is_deep_gemm_e8m0_used() or _valid_deep_gemm(hidden_states, w1, w2))): assert apply_router_weight_on_input is False assert is_act_and_mul, ( "DeepGemm only supports is_act_and_mul=True for now.") diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index 486ca881df48c..6cd81d97f0298 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -10,7 +10,7 @@ from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( DeepGemmExperts, _valid_deep_gemm, _valid_deep_gemm_shape, deep_gemm_block_shape) from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): @@ -107,7 +107,7 @@ class 
TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): # Note: the deep gemm workspaces are strictly larger than the triton # workspaces so we can be pessimistic here and allocate for DeepGemm # even if we fall back to triton later, e.g. if expert maps are set. - if self.allow_deep_gemm and (is_blackwell_deep_gemm_e8m0_used() + if self.allow_deep_gemm and (is_deep_gemm_e8m0_used() or _valid_deep_gemm_shape(M, N, K)): assert self.deep_gemm_expert is not None return self.deep_gemm_expert.workspace_shapes( @@ -143,7 +143,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): ): use_deep_gemm = (self.allow_deep_gemm and (_valid_deep_gemm(hidden_states, w1, w2) - or is_blackwell_deep_gemm_e8m0_used())) + or is_deep_gemm_e8m0_used())) experts = self.deep_gemm_expert if use_deep_gemm else self.triton_expert assert experts is not None diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index d45d368b582df..be358cfa949f0 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -48,8 +48,7 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.scalar_type import scalar_types from vllm.utils import has_deep_gemm -from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, - is_deep_gemm_supported) +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported from vllm.utils.flashinfer import has_flashinfer_moe if TYPE_CHECKING: @@ -427,7 +426,7 @@ class Fp8LinearMethod(LinearMethodBase): # On B200, if E8M0 for DeepGemm is used, we need to # requantize the weight and input to the specific scale # at the same time. 
- if is_blackwell_deep_gemm_e8m0_used(): + if is_deep_gemm_e8m0_used(): assert layer.weight_block_size is not None block_sz = tuple(layer.weight_block_size) requant_weight_ue8m0_inplace( @@ -734,7 +733,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): # DeepGemm scales need to be transposed and aligned. We try to do # it ahead of time for performance reasons. - if self.allow_deep_gemm and not is_blackwell_deep_gemm_e8m0_used(): + if self.allow_deep_gemm and not is_deep_gemm_e8m0_used(): # Lazy import to avoid CUDA initialization problems. if _is_col_major(layer.w13_weight_scale_inv): layer.w13_weight_scale_inv = \ @@ -871,7 +870,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): del layer.w13_input_scale del layer.w2_input_scale - if is_blackwell_deep_gemm_e8m0_used(): + if is_deep_gemm_e8m0_used(): assert layer.weight_block_size is not None # Re-quantise the expert weights so their scales are UE8M0. block_sz = tuple(layer.weight_block_size) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index ab1d5383f4651..7b324dce3c367 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import cdiv, direct_register_custom_op -from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, +from vllm.utils.deep_gemm import (is_deep_gemm_e8m0_used, should_use_deepgemm_for_fp8_linear) logger = init_logger(__name__) @@ -385,7 +385,7 @@ def per_token_group_quant_fp8( scaling factor. 
""" if use_ue8m0 is None: - use_ue8m0 = is_blackwell_deep_gemm_e8m0_used() + use_ue8m0 = is_deep_gemm_e8m0_used() dtype = current_platform.fp8_dtype() if dtype is None else dtype assert (x.shape[-1] % group_size == 0), ( f"the last dimension of `x` {x.shape[-1]} must be divisible " diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 2cd799e5eb5a9..bec792465bfbb 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -501,6 +501,24 @@ def get_config( if quantization_config is not None: config.quantization_config = quantization_config + # auto-enable DeepGEMM UE8M0 on Hopper if model config requests it + scale_fmt = quantization_config.get("scale_fmt", None) + if scale_fmt in ("ue8m0", ): + if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0_HOPPER"): + os.environ["VLLM_USE_DEEP_GEMM_E8M0_HOPPER"] = "1" + logger.info_once( + ("Detected quantization_config.scale_fmt=%s; " + "enabling Hopper UE8M0."), + scale_fmt, + ) + elif not envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER: + logger.warning_once( + ("Model config requests UE8M0 " + "(quantization_config.scale_fmt=%s), but " + "VLLM_USE_DEEP_GEMM_E8M0_HOPPER=0 is set; " + "Hopper UE8M0 disabled."), + scale_fmt, + ) if hf_overrides_kw: logger.debug("Overriding HF config with %s", hf_overrides_kw) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index b0bc3a79eb0ad..cd1dbfb813fee 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -31,34 +31,33 @@ def is_deep_gemm_supported() -> bool: @functools.cache -def is_blackwell_deep_gemm_e8m0_used() -> bool: +def is_deep_gemm_e8m0_used() -> bool: """Return ``True`` if vLLM is configured to use DeepGEMM " - "E8M0 scale on a Blackwell-class GPU. + "E8M0 scale on a Hopper or Blackwell-class GPU. 
""" if not is_deep_gemm_supported(): - logger.debug_once( + logger.info_once( "DeepGEMM E8M0 disabled: DeepGEMM not supported on this system.") return False - if not envs.VLLM_USE_DEEP_GEMM_E8M0: - logger.debug_once("DeepGEMM E8M0 disabled: VLLM_USE_DEEP_GEMM_E8M0=0.") - return False - _lazy_init() if _fp8_gemm_nt_impl is None: - logger.debug_once( - "DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found") + logger.info_once("DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found") return False - enabled = (current_platform.is_cuda() - and current_platform.has_device_capability(100)) - if enabled: - logger.debug_once("DeepGEMM E8M0 enabled on Blackwell GPU.") - else: - logger.debug_once( - "DeepGEMM E8M0 disabled: not running on Blackwell GPU.") - return enabled + if current_platform.is_device_capability(100) and \ + envs.VLLM_USE_DEEP_GEMM_E8M0: + logger.info_once("DeepGEMM E8M0 enabled on Blackwell GPU.") + return True + + if current_platform.is_device_capability(90) and \ + envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER: + logger.info_once("DeepGEMM E8M0 enabled on Hopper GPU.") + return True + + logger.info_once("DeepGEMM E8M0 disabled on current configuration.") + return False def _missing(*_: Any, **__: Any) -> NoReturn: @@ -124,20 +123,18 @@ def fp8_gemm_nt(*args, **kwargs): _lazy_init() if _fp8_gemm_nt_impl is None: return _missing(*args, **kwargs) - return _fp8_gemm_nt_impl( - *args, - disable_ue8m0_cast=not is_blackwell_deep_gemm_e8m0_used(), - **kwargs) + return _fp8_gemm_nt_impl(*args, + disable_ue8m0_cast=not is_deep_gemm_e8m0_used(), + **kwargs) def m_grouped_fp8_gemm_nt_contiguous(*args, **kwargs): _lazy_init() if _grouped_impl is None: return _missing(*args, **kwargs) - return _grouped_impl( - *args, - disable_ue8m0_cast=not is_blackwell_deep_gemm_e8m0_used(), - **kwargs) + return _grouped_impl(*args, + disable_ue8m0_cast=not is_deep_gemm_e8m0_used(), + **kwargs) def fp8_m_grouped_gemm_nt_masked(*args, **kwargs): @@ -145,9 +142,7 @@ def 
fp8_m_grouped_gemm_nt_masked(*args, **kwargs): if _grouped_masked_impl is None: return _missing(*args, **kwargs) return _grouped_masked_impl( - *args, - disable_ue8m0_cast=not is_blackwell_deep_gemm_e8m0_used(), - **kwargs) + *args, disable_ue8m0_cast=not is_deep_gemm_e8m0_used(), **kwargs) def _ceil_to_ue8m0(x: torch.Tensor): @@ -211,7 +206,7 @@ __all__ = [ "m_grouped_fp8_gemm_nt_contiguous", "fp8_m_grouped_gemm_nt_masked", "per_block_cast_to_fp8", - "is_blackwell_deep_gemm_e8m0_used", + "is_deep_gemm_e8m0_used", "is_deep_gemm_supported", "should_use_deepgemm_for_fp8_linear", ] From 841490434aaee4b1c8d8427112af740b6662f384 Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Wed, 27 Aug 2025 22:45:17 +0800 Subject: [PATCH 093/112] [Model] Enable native HF format InternVL support (#23742) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- docs/models/supported_models.md | 1 + .../multimodal/generation/test_common.py | 29 +++++++++---------- tests/models/registry.py | 3 +- vllm/model_executor/models/registry.py | 1 + 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 35a5fa0c2e42f..20cf75873af76 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -629,6 +629,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | | `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. 
| ✅︎ | ✅︎ | ✅︎ | +| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ | | `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 96208f8eda628..2b60faae8ec0b 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -222,21 +222,6 @@ VLM_TEST_SETTINGS = { }, marks=[large_gpu_mark(min_gb=32)], ), - # Check "auto" with fallback to transformers - "internvl-transformers": VLMTestInfo( - models=["OpenGVLab/InternVL3-1B-hf"], - test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 - img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>", - max_model_len=4096, - use_tokenizer_eos=True, - image_size_factors=[(0.25, 0.5, 1.0)], - vllm_runner_kwargs={ - "model_impl": "auto", - }, - auto_cls=AutoModelForImageTextToText, - marks=[pytest.mark.core_model], - ), #### Extended model tests "aria": VLMTestInfo( models=["rhymes-ai/Aria"], @@ -461,6 +446,20 @@ VLM_TEST_SETTINGS = { use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, ), + "intern_vl-hf": VLMTestInfo( + models=["OpenGVLab/InternVL3-1B-hf"], + test_type=( + VLMTestType.IMAGE, + VLMTestType.MULTI_IMAGE, + 
VLMTestType.VIDEO, + ), + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>", + video_idx_to_prompt=lambda idx: "<video>", + max_model_len=8192, + use_tokenizer_eos=True, + auto_cls=AutoModelForImageTextToText, + ), "kimi_vl": VLMTestInfo( models=["moonshotai/Kimi-VL-A3B-Instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), diff --git a/tests/models/registry.py b/tests/models/registry.py index ee546e7af85c6..2538e71692c4e 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -429,6 +429,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { "3.5-qwen3moe": "OpenGVLab/InternVL3_5-30B-A3B", # noqa: E501 "3.5-gptoss": "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview"}, # noqa: E501 trust_remote_code=True), + "InternVLForConditionalGeneration": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"), # noqa: E501 "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 trust_remote_code=True), "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501 @@ -584,7 +585,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { _TRANSFORMERS_BACKEND_MODELS = { "TransformersModel": _HfExamplesInfo("Qwen/Qwen3-Embedding-0.6B"), "TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501 - "TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"), + "TransformersForMultimodalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), } _EXAMPLE_MODELS = { diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 80eac78cdfadb..02ef301a52a43 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -220,6 +220,7 @@ _MULTIMODAL_MODELS = { "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), "InternS1ForConditionalGeneration": 
("interns1", "InternS1ForConditionalGeneration"), # noqa: E501 + "InternVLForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"), # noqa: E501 "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501 "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"), From 83f555f637b41a0f533fa1d37b194df6f564ac64 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Wed, 27 Aug 2025 16:59:34 +0200 Subject: [PATCH 094/112] [Doc]: upgrade version of crate-ci tool for improved typo detection (#23755) Signed-off-by: Didier Durand <durand.didier@gmail.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 612b290e88d46..c16bdeeecd07a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,7 @@ repos: - id: ruff-format files: ^(.buildkite|benchmarks|examples)/.* - repo: https://github.com/crate-ci/typos - rev: v1.34.0 + rev: v1.35.5 hooks: - id: typos - repo: https://github.com/PyCQA/isort From 3ce8285d6d96b929fddbb8d29be9ed3b81adcd75 Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Wed, 27 Aug 2025 08:11:33 -0700 Subject: [PATCH 095/112] [LogitsProcs] Deduplicate built-in LP implementation logic (#23362) Signed-off-by: Nick Hill <nhill@redhat.com> --- .../offline_inference/logits_processor.py | 38 ++--- tests/v1/logits_processors/utils.py | 37 ++--- vllm/v1/sample/logits_processor/builtin.py | 148 ++++++++---------- vllm/v1/sample/logits_processor/interface.py | 15 +- 4 files changed, 95 insertions(+), 143 deletions(-) diff --git a/examples/offline_inference/logits_processor.py b/examples/offline_inference/logits_processor.py index 7ef20efa7d28c..3e122319169eb 100644 --- a/examples/offline_inference/logits_processor.py +++ 
b/examples/offline_inference/logits_processor.py @@ -42,8 +42,8 @@ from vllm.config import VllmConfig from vllm.v1.sample.logits_processor import ( BatchUpdate, LogitsProcessor, - MoveDirectionality, ) +from vllm.v1.sample.logits_processor.builtin import process_dict_updates # Hypothetical custom logits processor @@ -53,38 +53,22 @@ class DummyLogitsProcessor(LogitsProcessor): def __init__( self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool ): - self.req_info: dict[int, SamplingParams] = {} + self.req_info: dict[int, int] = {} def is_argmax_invariant(self) -> bool: """Never impacts greedy sampling""" return False def update_state(self, batch_update: Optional[BatchUpdate]): - if not batch_update: - return - - # Process added requests. - for index, params, _, _ in batch_update.added: - assert params is not None - if params.extra_args and ( - target_token := params.extra_args.get("target_token") - ): - self.req_info[index] = target_token - - if self.req_info: - # Process removed requests. - for index in batch_update.removed: - self.req_info.pop(index, None) - - # Process moved requests, unidirectional move (a->b) and swap - # (a<->b) - for adx, bdx, direct in batch_update.moved: - a_val = self.req_info.pop(adx, None) - b_val = self.req_info.pop(bdx, None) - if a_val is not None: - self.req_info[bdx] = a_val - if direct == MoveDirectionality.SWAP and b_val is not None: - self.req_info[adx] = b_val + process_dict_updates( + self.req_info, + batch_update, + # This function returns the LP's per-request state based on the + # request details, or None if this LP does not apply to the + # request. 
+ lambda params, _, __: params.extra_args + and (params.extra_args.get("target_token")), + ) def apply(self, logits: torch.Tensor) -> torch.Tensor: if not self.req_info: diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py index c0bfc1a18feca..c36f1bd021c70 100644 --- a/tests/v1/logits_processors/utils.py +++ b/tests/v1/logits_processors/utils.py @@ -8,10 +8,9 @@ from typing import Optional import torch from vllm.config import VllmConfig -from vllm.sampling_params import SamplingParams from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP, BatchUpdate, - LogitsProcessor, - MoveDirectionality) + LogitsProcessor) +from vllm.v1.sample.logits_processor.builtin import process_dict_updates MODEL_NAME = "facebook/opt-125m" POOLING_MODEL_NAME = "BAAI/bge-base-en-v1.5" @@ -45,37 +44,19 @@ class DummyLogitsProcessor(LogitsProcessor): def __init__(self, vllm_config: "VllmConfig", device: torch.device, is_pin_memory: bool): - self.req_info: dict[int, SamplingParams] = {} + self.req_info: dict[int, int] = {} def is_argmax_invariant(self) -> bool: """Never impacts greedy sampling""" return False def update_state(self, batch_update: Optional[BatchUpdate]): - if not batch_update: - return - - # Process added requests. - for index, params, _, _ in batch_update.added: - assert params is not None - if params.extra_args and (target_token := - params.extra_args.get("target_token")): - self.req_info[index] = target_token - - if self.req_info: - # Process removed requests. 
- for index in batch_update.removed: - self.req_info.pop(index, None) - - # Process moved requests, unidirectional move (a->b) and swap - # (a<->b) - for adx, bdx, direct in batch_update.moved: - a_val = self.req_info.pop(adx, None) - b_val = self.req_info.pop(bdx, None) - if a_val is not None: - self.req_info[bdx] = a_val - if direct == MoveDirectionality.SWAP and b_val is not None: - self.req_info[adx] = b_val + process_dict_updates( + self.req_info, + batch_update, + lambda params, _, __: params.extra_args and + (params.extra_args.get("target_token")), + ) def apply(self, logits: torch.Tensor) -> torch.Tensor: if not self.req_info: diff --git a/vllm/v1/sample/logits_processor/builtin.py b/vllm/v1/sample/logits_processor/builtin.py index 00dd757489ca0..60f9c0bdb6313 100644 --- a/vllm/v1/sample/logits_processor/builtin.py +++ b/vllm/v1/sample/logits_processor/builtin.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Callable, Optional, TypeVar import torch +from vllm import SamplingParams from vllm.v1.sample.logits_processor.interface import (BatchUpdate, LogitsProcessor, MoveDirectionality) @@ -12,6 +13,8 @@ from vllm.v1.sample.logits_processor.interface import (BatchUpdate, if TYPE_CHECKING: from vllm.config import VllmConfig +T = TypeVar("T") + class MinPLogitsProcessor(LogitsProcessor): @@ -130,49 +133,15 @@ class LogitBiasLogitsProcessor(LogitsProcessor): return False def update_state(self, batch_update: Optional[BatchUpdate]): - if not batch_update: - return - - needs_update: bool = False - # Process added requests. 
- for index, params, _, _ in batch_update.added: - if lb := params.logit_bias: - self.biases[index] = lb - needs_update = True - else: - # Drop biases metadata at batch index - if self.biases.pop(index, None) is not None: - # If a new request replaces an old request which - # specified biases, we should update processor tensors - needs_update = True - - if self.biases: - # Process removed requests. - for index in batch_update.removed: - if self.biases.pop(index, None): - needs_update = True - - # Process moved requests, unidirectional (a->b) and swap (a<->b) - for a_index, b_index, direct in batch_update.moved: - if direct == MoveDirectionality.UNIDIRECTIONAL: - if (a_entry := self.biases.pop(a_index, None)) is None: - if self.biases.pop(b_index, None) is not None: - needs_update = True - else: - self.biases[b_index] = a_entry - needs_update = True - else: - a_entry = self.biases.pop(a_index, None) - if (b_entry := self.biases.pop(b_index, None)) is not None: - self.biases[a_index] = b_entry - needs_update = True - if a_entry is not None: - self.biases[b_index] = a_entry - needs_update = True + needs_update = process_dict_updates( + self.biases, batch_update, + lambda params, _, __: params.logit_bias or None) # Update tensors if needed. 
if needs_update: - reqs, tok_ids, biases = [], [], [] + reqs: list[int] = [] + tok_ids: list[int] = [] + biases: list[float] = [] for req, lb in self.biases.items(): reqs.extend([req] * len(lb)) tok_ids.extend(lb.keys()) @@ -216,52 +185,18 @@ class MinTokensLogitsProcessor(LogitsProcessor): of the argmax operation in greedy sampling.""" return False + @staticmethod + def add_request( + params: SamplingParams, _: list[int], output_tok_ids: list[int] + ) -> Optional[tuple[int, Sequence[int], set[int]]]: + min_tokens = params.min_tokens + if not min_tokens or len(output_tok_ids) >= min_tokens: + return None + return min_tokens, output_tok_ids, params.all_stop_token_ids + def update_state(self, batch_update: Optional[BatchUpdate]): - needs_update = False - - if batch_update: - # Process added requests. - for index, params, _, output_tok_ids in batch_update.added: - if ((min_tokens := params.min_tokens) - and len(output_tok_ids) < min_tokens): - # Replace request metadata at batch index - self.min_toks[index] = (min_tokens, output_tok_ids, - params.all_stop_token_ids) - needs_update = True - else: - # Drop min_toks metadata at batch index - if self.min_toks.pop(index, None) is not None: - # If a new request replaces an old request which - # specified min_toks, we should update processor tensors - needs_update = True - - if self.min_toks: - # Process removed requests. 
- for index in batch_update.removed: - if self.min_toks.pop(index, None): - needs_update = True - - # Process moved requests, unidirectional (a->b) and - # swapped (a<->b) - for a_index, b_index, direct in batch_update.moved: - if direct == MoveDirectionality.UNIDIRECTIONAL: - if (a_entry := self.min_toks.pop(a_index, - None)) is None: - if self.min_toks.pop(b_index, None) is not None: - needs_update = True - else: - self.min_toks[b_index] = a_entry - needs_update = True - else: - a_entry = self.min_toks.pop(a_index, None) - if (b_entry := self.min_toks.pop(b_index, - None)) is not None: - self.min_toks[a_index] = b_entry - needs_update = True - if a_entry is not None: - self.min_toks[b_index] = a_entry - needs_update = True - + needs_update = process_dict_updates(self.min_toks, batch_update, + self.add_request) if self.min_toks: # Check for any requests that have attained their min tokens. to_remove = tuple(index for index, (min_toks, out_tok_ids, @@ -295,3 +230,44 @@ class MinTokensLogitsProcessor(LogitsProcessor): # Inhibit EOS token for requests which have not reached min length logits[self.logits_slice] = -float("inf") return logits + + +def process_dict_updates( + req_entries: dict[int, T], batch_update: Optional[BatchUpdate], + new_state: Callable[[SamplingParams, list[int], list[int]], Optional[T]] +) -> bool: + """Utility function to update dict state for sparse LogitsProcessors.""" + + if not batch_update: + # Nothing to do. + return False + + updated = False + for index, params, prompt_tok_ids, output_tok_ids in batch_update.added: + if (state := new_state(params, prompt_tok_ids, + output_tok_ids)) is not None: + req_entries[index] = state + updated = True + elif req_entries.pop(index, None) is not None: + updated = True + + if req_entries: + # Process removed requests. 
+ for index in batch_update.removed: + if req_entries.pop(index, None): + updated = True + + # Process moved requests, unidirectional (a->b) and + # swapped (a<->b) + for a_index, b_index, direct in batch_update.moved: + a_entry = req_entries.pop(a_index, None) + b_entry = req_entries.pop(b_index, None) + if a_entry is not None: + req_entries[b_index] = a_entry + updated = True + if b_entry is not None: + updated = True + if direct == MoveDirectionality.SWAP: + req_entries[a_index] = b_entry + + return updated diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py index 12b4db24bff88..16cd00943db8d 100644 --- a/vllm/v1/sample/logits_processor/interface.py +++ b/vllm/v1/sample/logits_processor/interface.py @@ -44,10 +44,16 @@ class BatchUpdate: # Key assumption: the `output_tok_ids` list (which is an element of each # tuple in `added`) is a reference to the request's running output tokens # list; via this reference, the logits processors always see the latest - # list of generated output tokens + # list of generated output tokens. + # + # NOTE: + # * Added or moved requests may replace existing requests with the same + # index. + # * Operations should be processed in the following order: + # - removed, added, moved removed: Sequence[RemovedRequest] - moved: Sequence[MovedRequest] added: Sequence[AddedRequest] + moved: Sequence[MovedRequest] class LogitsProcessor(ABC): @@ -59,6 +65,11 @@ class LogitsProcessor(ABC): @abstractmethod def apply(self, logits: torch.Tensor) -> torch.Tensor: + """Apply LogitsProcessor to batch logits tensor. + + The updated tensor must be returned but may be + modified in-place. 
+ """ raise NotImplementedError @abstractmethod From 2b61d2e22fbcfd6c9df9cdf06f5905b311c2ca18 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 27 Aug 2025 17:22:21 +0100 Subject: [PATCH 096/112] [Docs] Remove in-tree Gaudi install instructions (#23628) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/getting_started/installation/README.md | 1 - .../installation/intel_gaudi.md | 388 ------------------ 2 files changed, 389 deletions(-) delete mode 100644 docs/getting_started/installation/intel_gaudi.md diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index 0ee680f5c688c..8a658b7a9103f 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -12,7 +12,6 @@ vLLM supports the following hardware platforms: - [Apple silicon](cpu.md#apple-silicon) - [IBM Z (S390X)](cpu.md#ibm-z-s390x) - [Google TPU](google_tpu.md) -- [Intel Gaudi](intel_gaudi.md) - [AWS Neuron](aws_neuron.md) ## Hardware Plugins diff --git a/docs/getting_started/installation/intel_gaudi.md b/docs/getting_started/installation/intel_gaudi.md deleted file mode 100644 index ff912efec9ca8..0000000000000 --- a/docs/getting_started/installation/intel_gaudi.md +++ /dev/null @@ -1,388 +0,0 @@ -# Intel Gaudi - -This page provides instructions on running vLLM with Intel Gaudi devices. - -!!! warning - There are no pre-built wheels or images for this device, so you must build vLLM from source. - -## Requirements - -- OS: Ubuntu 22.04 LTS -- Python: 3.10 -- Intel Gaudi accelerator -- Intel Gaudi software version 1.18.0 - -Please follow the instructions provided in the -[Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) -to set up the execution environment. 
To achieve the best performance, -please follow the methods outlined in the -[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). - -## Configure a new environment - -### Environment verification - -To verify that the Intel Gaudi software was correctly installed, run: - -```bash -hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible -apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed -pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed -pip list | grep neural # verify that neural_compressor_pt is installed -``` - -Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) -for more details. - -### Run Docker Image - -It is highly recommended to use the latest Docker image from Intel Gaudi -vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) -for more details. - -Use the following commands to run a Docker image: - -```bash -docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest -docker run \ - -it \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest -``` - -## Set up using Python - -### Pre-built wheels - -Currently, there are no pre-built Intel Gaudi wheels. 
- -### Build wheel from source - -To build and install vLLM from source, run: - -```bash -git clone https://github.com/vllm-project/vllm.git -cd vllm -pip install -r requirements/hpu.txt -python setup.py develop -``` - -Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: - -```bash -git clone https://github.com/HabanaAI/vllm-fork.git -cd vllm-fork -git checkout habana_main -pip install -r requirements/hpu.txt -python setup.py develop -``` - -## Set up using Docker - -### Pre-built images - -Currently, there are no pre-built Intel Gaudi images. - -### Build image from source - -```bash -docker build -f docker/Dockerfile.hpu -t vllm-hpu-env . -docker run \ - -it \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --rm vllm-hpu-env -``` - -!!! tip - If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. 
- -## Extra information - -### Supported features - -- [Offline inference](../../serving/offline_inference.md) -- Online serving via [OpenAI-Compatible Server](../../serving/openai_compatible_server.md) -- HPU autodetection - no need to manually select device within vLLM -- Paged KV cache with algorithms enabled for Intel Gaudi accelerators -- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, - prefill attention, Root Mean Square Layer Normalization, Rotary - Positional Encoding -- Tensor parallelism support for multi-card inference -- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) - for accelerating low-batch latency and throughput -- Attention with Linear Biases (ALiBi) -- INC quantization - -### Unsupported features - -- Beam search -- LoRA adapters -- AWQ quantization -- Prefill chunking (mixed-batch inferencing) - -### Supported configurations - -The following configurations have been validated to function with -Gaudi2 devices. Configurations that are not listed may or may not work. 
- -| Model | TP Size| dtype | Sampling | -|-------|--------|--------|----------| -| [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) | 1, 2, 8 | BF16 | Random / Greedy | -| [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1, 2, 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 1, 2, 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | 1, 2, 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) | 1, 2, 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1, 2, 8 | BF16 | Random / Greedy | -| [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) | 8 | BF16 | Random / Greedy | -| [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) | 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 8 | BF16 | Random / Greedy | - -## Performance tuning - -### Execution modes - -Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag. 
- -| `PT_HPU_LAZY_MODE` | `enforce_eager` | execution mode | -|----------------------|-------------------|--------------------| -| 0 | 0 | torch.compile | -| 0 | 1 | PyTorch eager mode | -| 1 | 0 | HPU Graphs | - -!!! warning - In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. - -[](){ #gaudi-bucketing-mechanism } - -### Bucketing mechanism - -Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. -In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`. - -!!! note - Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. - -Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. 
These parameters can be observed in logs during vLLM startup: - -```text -INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] -INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] -INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] -INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] -``` - -| Parameter | Description | -|----------------|-----------------------------------------------------------------------------| -| `min` | Determines the lowest value of the bucket. | -| `step` | Determines the interval between buckets. | -| `max` | Determines the upper bound of the bucket. | -| Ramp-up phase | A special handling phase applied between `min` and `step`:<br/>- `min` is multiplied by consecutive powers of two until `step` is reached.<br/>- Minimizes resource wastage for small batch sizes.<br/>- Allows larger padding for larger batches. 
| - -Example (with ramp-up): - -```text -min = 2, step = 32, max = 64 -=> ramp_up = (2, 4, 8, 16) -=> stable = (32, 64) -=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) -``` - -Example (without ramp-up): - -```text -min = 128, step = 128, max = 512 -=> ramp_up = () -=> stable = (128, 256, 384, 512) -=> buckets = ramp_up + stable => (128, 256, 384, 512) -``` - -In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. - -!!! warning - If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. - -As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket. - -!!! note - Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. 
- -### Warmup - -Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: - -??? console "Logs" - - ```text - INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB - INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB - INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB - ... - INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB - INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB - INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB - ... - INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB - INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB - ``` - -This example uses the same buckets as in the [Bucketing Mechanism][gaudi-bucketing-mechanism] section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. - -!!! tip - Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. 
It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. - -### HPU Graph capture - -[HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. - -When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by `gpu_memory_utilization` flag (`0.9` by default). -Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. -Only after that, `gpu_memory_utilization` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable. -Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. -Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of memory reserved for HPU Graphs capture. -With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. -Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints. -Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. 
`VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. - -!!! note - `gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. - -User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: - -- `max_bs` - graph capture queue will be sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode -- `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt - -When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy. - -!!! 
note - `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt to do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. - -Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): - -??? console "Logs" - - ```text - INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] - INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] - INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] - INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - INFO 
08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) - INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache - INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 - INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) - INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB - ... - INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) - INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB - ... 
- INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB - INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB - ... - INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB - INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB - INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB - INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB - INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB - INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] - INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory - INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory 
(475.4 GiB/1007 GiB used) - ``` - -### Recommended vLLM Parameters - -- We recommend running inference on Gaudi 2 with `block_size` of 128 - for BF16 data type. Using default values (16, 32) might lead to - sub-optimal performance due to Matrix Multiplication Engine - under-utilization (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). -- For max throughput on Llama 7B, we recommend running with batch size - of 128 or 256 and max context length of 2048 with HPU Graphs enabled. - If you encounter out-of-memory issues, see troubleshooting section. - -### Environment variables - -**Diagnostic and profiling knobs:** - -- `VLLM_PROFILER_ENABLED`: If `true`, enable the high level profiler. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). `false` by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: If `true`, log graph compilations for each vLLM engine step when any occurs. Highly recommended to use with `PT_HPU_METRICS_GC_DETAILS=1`. `false` by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: If `true`, always log graph compilations for each vLLM engine step even if none occurred. `false` by default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: If `true`, log CPU fallbacks for each vLLM engine step when any occurs. `false` by default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, always log CPU fallbacks for each vLLM engine step even if none occurred. `false` by default. 
- -**Performance tuning knobs:** - -- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by default - -- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default - -- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.3` by default - -- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default - -- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default - -- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism - - - `{phase}` is either `PROMPT` or `DECODE` - - - `{dim}` is either `BS`, `SEQ` or `BLOCK` - - - `{param}` is either `MIN`, `STEP` or `MAX` - - - Default values: - -| `{phase}` | Parameter | Env Variable | Value Expression | -|-----------|-----------|--------------|------------------| -| Prompt | Batch size min | `VLLM_PROMPT_BS_BUCKET_MIN` | `1` | -| Prompt | Batch size step | `VLLM_PROMPT_BS_BUCKET_STEP` | `min(max_num_seqs, 32)` | -| Prompt | Batch size max | `VLLM_PROMPT_BS_BUCKET_MAX` | `min(max_num_seqs, 64)` | -| Prompt | Sequence length min | `VLLM_PROMPT_SEQ_BUCKET_MIN` | `block_size` | -| Prompt | Sequence length step | `VLLM_PROMPT_SEQ_BUCKET_STEP` | `block_size` | -| Prompt | Sequence length max | `VLLM_PROMPT_SEQ_BUCKET_MAX` | `max_model_len` | -| Decode | Batch size min | `VLLM_DECODE_BS_BUCKET_MIN` | `1` | -| Decode | Batch size step | `VLLM_DECODE_BS_BUCKET_STEP` | `min(max_num_seqs, 32)` | -| Decode | Batch size max | `VLLM_DECODE_BS_BUCKET_MAX` | `max_num_seqs` | -| Decode | Sequence length min | `VLLM_DECODE_BLOCK_BUCKET_MIN` | `block_size` | -| Decode | Sequence length step | `VLLM_DECODE_BLOCK_BUCKET_STEP` | `block_size` | -| Decode | Sequence length max | `VLLM_DECODE_BLOCK_BUCKET_MAX` | `max(128, 
(max_num_seqs*max_model_len)/block_size)` | - -Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: - -- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used; if `1`, PyTorch Lazy backend for Gaudi will be used. `1` is default. -- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs - -## Troubleshooting: tweaking HPU graphs - -If you experience device out-of-memory issues or want to attempt -inference at higher batch sizes, try tweaking HPU Graphs by following -the below: - -- Tweak `gpu_memory_utilization` knob. It will decrease the - allocation of KV cache, leaving some headroom for capturing graphs - with larger batch size. By default `gpu_memory_utilization` is set - to 0.9. It attempts to allocate ~90% of HBM left for KV cache after - short profiling run. Note that decreasing reduces the number of KV - cache blocks you have available, and therefore reduces the effective - maximum number of tokens you can handle at a given time. -- If this method is not efficient, you can disable `HPUGraph` - completely. With HPU Graphs disabled, you are trading latency and - throughput at lower batches for potentially higher throughput on - higher batches. You can do that by adding `--enforce-eager` flag to - server (for online serving), or by passing `enforce_eager=True` - argument to LLM constructor (for offline inference). 
From 4f35be10a96feeca0328d3ab8d359e1eaae5c23d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= <ProExpertProg@users.noreply.github.com> Date: Wed, 27 Aug 2025 12:47:28 -0400 Subject: [PATCH 097/112] [BugFix] Fix topk_softmax assert (#19764) Signed-off-by: Luka Govedic <lgovedic@redhat.com> --- csrc/moe/topk_softmax_kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 99c52ef17d08b..cd80bfda7dfde 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -573,7 +573,7 @@ void topk_softmax( stream); } else { - assert(topk_indices.scalar_type() == at::ScalarType::Int64); + TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long); vllm::moe::topkGatingSoftmaxKernelLauncher( gating_output.data_ptr<float>(), topk_weights.data_ptr<float>(), From 52883ed08461943ff55d5dd3cf12a28c00902fa7 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Thu, 28 Aug 2025 01:01:50 +0800 Subject: [PATCH 098/112] [Model] Merge `SupportsMultiModalWithRawInput` with `SupportsMultiModal` (#23749) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/config/__init__.py | 8 ++-- vllm/model_executor/models/interfaces.py | 45 +++++-------------- .../models/prithvi_geospatial_mae.py | 6 +-- vllm/model_executor/models/registry.py | 11 ++--- vllm/v1/worker/gpu_model_runner.py | 10 +++-- 5 files changed, 30 insertions(+), 50 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index e3fb6d796def5..351833d3f02d0 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1698,6 +1698,10 @@ class ModelConfig: def is_multimodal_model(self) -> bool: return self.multimodal_config is not None + @property + def is_multimodal_raw_input_only_model(self) -> bool: + return self._model_info.supports_multimodal_raw_input_only + @property def is_cross_encoder(self) -> bool: return 
(self._model_info.supports_cross_encoding @@ -1707,10 +1711,6 @@ class ModelConfig: def is_pp_supported(self) -> bool: return self._model_info.supports_pp - @property - def is_multimodal_raw_input_supported(self) -> bool: - return self._model_info.supports_multimodal_raw_input - @property def is_attention_free(self) -> bool: return self._model_info.is_attention_free diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 506732fed3614..2ee966fb5c0c8 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -52,6 +52,12 @@ class SupportsMultiModal(Protocol): MRO of your model class. """ + supports_multimodal_raw_input_only: ClassVar[bool] = False + """ + A flag that indicates this model supports multi-modal inputs and processes + them in their raw form and not embeddings. + """ + supports_encoder_tp_data: ClassVar[bool] = False """ A flag that indicates whether this model supports @@ -143,45 +149,16 @@ def supports_multimodal( return getattr(model, "supports_multimodal", False) +def supports_multimodal_raw_input_only( + model: Union[type[object], object]) -> bool: + return getattr(model, "supports_multimodal_raw_input_only", False) + + def supports_multimodal_encoder_tp_data( model: Union[type[object], object]) -> bool: return getattr(model, "supports_encoder_tp_data", False) -@runtime_checkable -class SupportsMultiModalWithRawInput(SupportsMultiModal, Protocol): - """The interface required for all multi-modal models.""" - - supports_multimodal_raw_input: ClassVar[Literal[True]] = True - """ - A flag that indicates this model supports multi-modal inputs and processes - them in their raw form and not embeddings. - - Note: - There is no need to redefine this flag if this class is in the - MRO of your model class. - """ - - -@overload -def supports_multimodal_raw_input( - model: object) -> TypeIs[SupportsMultiModalWithRawInput]: - ... 
- - -@overload -def supports_multimodal_raw_input( - model: type[object]) -> TypeIs[type[SupportsMultiModalWithRawInput]]: - ... - - -def supports_multimodal_raw_input( - model: Union[type[object], object] -) -> Union[TypeIs[type[SupportsMultiModalWithRawInput]], - TypeIs[SupportsMultiModalWithRawInput]]: - return getattr(model, "supports_multimodal_raw_input", False) - - @runtime_checkable class SupportsScoreTemplate(Protocol): """The interface required for all models that support score template.""" diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index f46d6375e1f61..2d14fe6d5892f 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -41,7 +41,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from .interfaces import (IsAttentionFree, MultiModalEmbeddings, - SupportsMultiModalWithRawInput) + SupportsMultiModal) from .interfaces_base import default_pooling_type @@ -174,10 +174,10 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): info=PrithviGeoSpatialMAEProcessingInfo, dummy_inputs=PrithviGeoSpatialMAEInputBuilder, ) -class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, - SupportsMultiModalWithRawInput): +class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal): """Prithvi Masked Autoencoder""" + supports_multimodal_raw_input_only = True is_pooling_model = True @classmethod diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 02ef301a52a43..12c0c77784db8 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -29,7 +29,7 @@ from .interfaces import (has_inner_state, has_noops, is_attention_free, is_hybrid, supports_cross_encoding, supports_multimodal, supports_multimodal_encoder_tp_data, - supports_multimodal_raw_input, 
supports_pp, + supports_multimodal_raw_input_only, supports_pp, supports_transcription, supports_v0_only) from .interfaces_base import (get_default_pooling_type, is_pooling_model, is_text_generation_model) @@ -326,7 +326,7 @@ class _ModelInfo: default_pooling_type: str supports_cross_encoding: bool supports_multimodal: bool - supports_multimodal_raw_input: bool + supports_multimodal_raw_input_only: bool supports_multimodal_encoder_tp_data: bool supports_pp: bool has_inner_state: bool @@ -346,7 +346,8 @@ class _ModelInfo: default_pooling_type=get_default_pooling_type(model), supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), - supports_multimodal_raw_input=supports_multimodal_raw_input(model), + supports_multimodal_raw_input_only= + supports_multimodal_raw_input_only(model), supports_multimodal_encoder_tp_data= supports_multimodal_encoder_tp_data(model), supports_pp=supports_pp(model), @@ -743,13 +744,13 @@ class _ModelRegistry: model_cls, _ = self.inspect_model_cls(architectures, model_config) return model_cls.supports_multimodal - def supports_multimodal_raw_input( + def is_multimodal_raw_input_only_model( self, architectures: Union[str, list[str]], model_config: ModelConfig, ) -> bool: model_cls, _ = self.inspect_model_cls(architectures, model_config) - return model_cls.supports_multimodal_raw_input + return model_cls.supports_multimodal_raw_input_only def is_pp_supported_model( self, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d93460d618e7c..20d2d20ba0967 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -139,8 +139,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cache_config.cache_dtype] self.is_pooling_model = model_config.pooler_config is not None - self.is_multimodal_raw_input_supported = ( - model_config.is_multimodal_raw_input_supported) + self.is_multimodal_raw_input_only_model = ( + 
model_config.is_multimodal_raw_input_only_model) + self.max_model_len = model_config.max_model_len self.max_num_tokens = scheduler_config.max_num_batched_tokens self.max_num_reqs = scheduler_config.max_num_seqs @@ -612,7 +613,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self, scheduler_output: "SchedulerOutput", ) -> BatchedTensorInputs: - if not self.is_multimodal_raw_input_supported or not scheduler_output: # noqa: SIM102 + if not scheduler_output or not self.is_multimodal_raw_input_only_model: return {} mm_kwargs = list[MultiModalKwargsItem]() @@ -631,8 +632,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return mm_kwargs_combined def _dummy_mm_kwargs(self, num_seqs: int) -> BatchedTensorInputs: - if not self.is_multimodal_raw_input_supported: + if not self.is_multimodal_raw_input_only_model: return {} + mm_budget = self.mm_budget assert mm_budget is not None From dd589322801e2eb8426aa2b95f2729699ff431c5 Mon Sep 17 00:00:00 2001 From: Thomas Parnell <tpa@zurich.ibm.com> Date: Wed, 27 Aug 2025 19:05:16 +0200 Subject: [PATCH 099/112] [V1] [Hybrid] Enable compile and piecewise CUDA graph for MiniMax-Text models (#22589) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> --- vllm/config/compilation.py | 1 + vllm/model_executor/models/minimax_text_01.py | 234 ++++++++---------- 2 files changed, 98 insertions(+), 137 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 56aa00a30d3ae..5c3b220016360 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -339,6 +339,7 @@ class CompilationConfig: "vllm.mamba_mixer2", "vllm.mamba_mixer", "vllm.short_conv", + "vllm.linear_attention", ] def compute_hash(self) -> str: diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 0e854bd7d913d..176a40179bcac 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ 
b/vllm/model_executor/models/minimax_text_01.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only MiniMaxText01 model.""" -import copy import math from collections.abc import Iterable from typing import TYPE_CHECKING, Optional, Union @@ -19,13 +18,14 @@ from transformers import MiniMaxConfig from vllm import envs from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import (CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config) from vllm.distributed.communication_op import tensor_model_parallel_all_reduce from vllm.distributed.parallel_state import ( get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.forward_context import get_forward_context +from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE @@ -43,12 +43,15 @@ from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import maybe_prefix from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors +from vllm.utils import direct_register_custom_op from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata from .interfaces import HasInnerState, 
IsHybrid @@ -143,61 +146,6 @@ class MiniMaxText01RMSNormTP(CustomOp): return self._forward(x) -class MiniMaxText01RotaryEmbedding(CustomOp): - name = "MiniMaxText01RotaryEmbedding" - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position: int, - base: float, - is_neox_style: bool, - cache_dtype: torch.dtype, - ) -> None: - super().__init__() - self.head_size = head_size - self.rotary_dim = rotary_dim - self.max_position_embeddings = max_position - self.base = base - self.is_neox_style = is_neox_style - self.cache_dtype = cache_dtype - cache = self._compute_cos_sin_cache().to(cache_dtype) - self.register_buffer("cos_sin_cache", cache, persistent=False) - - def _compute_inv_freq(self, base: float) -> torch.Tensor: - """Compute the inverse frequency.""" - inv_freq = 1.0 / (base**(torch.arange( - 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim)) - return inv_freq - - def _compute_cos_sin_cache(self) -> torch.Tensor: - """Compute the cos and sin cache.""" - inv_freq = self._compute_inv_freq(self.base) - t = torch.arange(self.max_position_embeddings, dtype=torch.float) - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() - sin = freqs.sin() - cache = torch.cat((cos, sin), dim=-1) - return cache - - def forward( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - from vllm import _custom_ops as ops - self.cos_sin_cache = self.cos_sin_cache.to(positions.device) - query_cast = query.to(self.cache_dtype) - key_cast = key.to(self.cache_dtype) - ops.rotary_embedding(positions, query_cast, key_cast, self.head_size, - self.cos_sin_cache, self.is_neox_style) - query = query_cast.to(query.dtype) - key = key_cast.to(key.dtype) - return query, key - - class MiniMaxText01MLP(nn.Module): def __init__( @@ -526,20 +474,40 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase): slot_id, 32) return hidden - def forward(self, hidden_states: torch.Tensor, positions: 
torch.Tensor, - kv_caches: MinimaxCacheParams, **kwargs) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) + def forward(self, hidden_states: torch.Tensor, output: torch.Tensor, + positions: torch.Tensor, + kv_caches: MinimaxCacheParams) -> None: + if not envs.VLLM_USE_V1: + self._forward(hidden_states, output, positions, kv_caches) + else: + torch.ops.vllm.linear_attention( + hidden_states, + output, + positions, + self.prefix, + ) + + def _forward(self, hidden_states: torch.Tensor, output: torch.Tensor, + positions: torch.Tensor, + kv_caches: Optional[MinimaxCacheParams]) -> None: + forward_context = get_forward_context() + attn_metadata: AttentionMetadata = forward_context.attn_metadata + if envs.VLLM_USE_V1 and attn_metadata is not None: + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + assert isinstance(attn_metadata, LinearAttentionMetadata) + num_actual_tokens = attn_metadata.num_prefill_tokens + \ + attn_metadata.num_decode_tokens + else: + num_actual_tokens = hidden_states.shape[0] + + qkv, _ = self.qkv_proj(hidden_states[:num_actual_tokens]) qkv32 = qkv.to(torch.float32) qkvact = torch.nn.functional.silu(qkv32) qkvact = qkvact.view((qkv.shape[0], self.tp_heads, -1)) q, k, v = torch.split(qkvact, [self.head_dim] * 3, dim=-1) - forward_context = get_forward_context() - attn_metadata = forward_context.attn_metadata if envs.VLLM_USE_V1: if attn_metadata is not None: - assert isinstance(attn_metadata, dict) - attn_metadata = attn_metadata[self.prefix] - assert isinstance(attn_metadata, LinearAttentionMetadata) kv_cache = self.kv_cache[forward_context.virtual_engine][0] state_indices_tensor = attn_metadata.state_indices_tensor @@ -578,13 +546,11 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase): hidden = self._decode_infer(q, k, v, kv_cache, state_indices_tensor, attn_metadata) - hidden = self.norm._forward(hidden) - gate, _ = self.output_gate(hidden_states) + gate, _ = 
self.output_gate(hidden_states[:num_actual_tokens]) hidden = F.sigmoid(gate) * hidden hidden = hidden.to(hidden_states.dtype) - hidden, _ = self.out_proj(hidden) - return hidden + output[:num_actual_tokens], _ = self.out_proj(hidden) class MiniMaxText01Attention(nn.Module): @@ -652,23 +618,23 @@ class MiniMaxText01Attention(nn.Module): quant_config=quant_config, prefix=f"{prefix}.attn", ) + self.rotary_emb = get_rope( + head_size=self.head_dim, + rotary_dim=rotary_dim, + max_position=max_position, + base=int(rope_theta), + is_neox_style=True, + dtype=torch.float32, + ) return - def forward(self, hidden_states: torch.Tensor, positions: torch.Tensor, - **kwargs) -> torch.Tensor: - forward_context = get_forward_context() - attn_metadata = forward_context.attn_metadata + def forward(self, hidden_states: torch.Tensor, output: torch.Tensor, + positions: torch.Tensor, **kwargs) -> None: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - if envs.VLLM_USE_V1: - if attn_metadata is not None: - q, k = attn_metadata[f"{self.prefix}.attn"].rotary_emb( - positions, q, k) - else: - q, k = attn_metadata.rotary_emb(positions, q, k) + q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) - output, _ = self.o_proj(attn_output) - return output + output[:], _ = self.o_proj(attn_output) class MiniMaxText01DecoderLayer(nn.Module): @@ -816,16 +782,15 @@ class MiniMaxText01DecoderLayer(nn.Module): is_warmup: bool = False, **kwargs) -> tuple[torch.Tensor, torch.Tensor]: - forward_context = get_forward_context() - attn_metadata = forward_context.attn_metadata layernorm_input = hidden_states layernorm_output = self.input_layernorm(layernorm_input) residual = layernorm_output if self.postnorm else layernorm_input - self_attention_output = self.self_attn( + self_attention_output = torch.empty_like(layernorm_output) + self.self_attn( hidden_states=layernorm_output, + output=self_attention_output, positions=positions, 
kv_caches=kv_caches, - attn_metadata=attn_metadata, ) residual = residual * self.layernorm_attention_alpha @@ -839,8 +804,8 @@ class MiniMaxText01DecoderLayer(nn.Module): if self.expert_num == 1: hidden_states = self.mlp(layernorm_output) else: - moe_hidden_states = self.block_sparse_moe( - copy.deepcopy(layernorm_output)) + moe_layernorm_output = layernorm_output.clone() + moe_hidden_states = self.block_sparse_moe(moe_layernorm_output) if self.shared_moe: before_moe_dtype = layernorm_output.dtype moe_hidden_fp32 = moe_hidden_states.to(torch.float32) @@ -878,18 +843,16 @@ class MiniMaxText01DecoderLayer(nn.Module): return +@support_torch_compile class MiniMaxText01Model(nn.Module): - def __init__( - self, - config: MiniMaxConfig, - model_config: Optional[ModelConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - cache_config: Optional[CacheConfig] = None, - scheduler_config=None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config: MiniMaxConfig = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + quant_config = vllm_config.quant_config + cache_config = vllm_config.cache_config + scheduler_config = vllm_config.scheduler_config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -976,24 +939,6 @@ class MiniMaxText01Model(nn.Module): self.minimax_cache = MinimaxCacheManager( dtype=torch.float32, cache_shape=self.cache_shape) - rope_theta = getattr(config, "rope_theta", 10000) - head_dim = getattr(config, "head_dim", None) - if head_dim is None: - head_dim = config.hidden_size // config.num_attention_heads - if hasattr(config, "max_model_len") and isinstance( - config.max_model_len, int): - max_position_embeddings = min(config.max_position_embeddings, - config.max_model_len) - self.rotary_emb = MiniMaxText01RotaryEmbedding( - head_dim, - rotary_dim=config.rotary_dim - if hasattr(config, "rotary_dim") else head_dim, - 
max_position=max_position_embeddings, - base=int(rope_theta), - is_neox_style=True, - cache_dtype=torch.float32, - ) - norm_kwargs = {} if hasattr(config, "rms_norm_eps"): norm_kwargs["eps"] = config.rms_norm_eps @@ -1043,12 +988,11 @@ class MiniMaxText01Model(nn.Module): attn_metadata = forward_context.attn_metadata if not envs.VLLM_USE_V1 and attn_metadata is None: return None - if "request_ids_to_seq_ids" not in kwargs: - kwargs["request_ids_to_seq_ids"] = {} - if "finished_requests_ids" not in kwargs: - kwargs["finished_requests_ids"] = [] - if not envs.VLLM_USE_V1: + if "request_ids_to_seq_ids" not in kwargs: + kwargs["request_ids_to_seq_ids"] = {} + if "finished_requests_ids" not in kwargs: + kwargs["finished_requests_ids"] = [] ( minimax_cache_tensors, state_indices_tensor, @@ -1077,16 +1021,6 @@ class MiniMaxText01Model(nn.Module): for i in range(self.start_layer, self.end_layer): layer = self.layers[i] - if attn_metadata is not None: - # TODO (tdoublep): this whole thing with the rotary_emb is - # weird. we shouldn't be passing it via attn_metadata imo. 
- if envs.VLLM_USE_V1: - if isinstance(layer.self_attn, MiniMaxText01Attention): - attn_metadata[layer.prefix + - ".attn"].rotary_emb = self.rotary_emb - else: - attn_metadata.rotary_emb = self.rotary_emb - _caches = None if not envs.VLLM_USE_V1 and isinstance( layer.self_attn, MiniMaxText01LinearAttention): @@ -1120,7 +1054,6 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid): super().__init__() config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config @@ -1133,13 +1066,8 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid): self.unpadded_vocab_size = self.config.vocab_size if hasattr(vllm_config.model_config, "max_model_len"): self.config.max_model_len = vllm_config.model_config.max_model_len - self.model = MiniMaxText01Model( - self.config, - model_config=vllm_config.model_config, - cache_config=vllm_config.cache_config, - quant_config=quant_config, - scheduler_config=vllm_config.scheduler_config, - prefix=maybe_prefix(prefix, "model")) + self.model = MiniMaxText01Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: self.lm_head = ParallelLMHead( self.unpadded_vocab_size, @@ -1469,3 +1397,35 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid): tp_size=parallel_config.tensor_parallel_size, head_dim=hf_config.head_dim, ) + + +def linear_attention( + hidden_states: torch.Tensor, + output: torch.Tensor, + positions: torch.Tensor, + layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + self._forward(hidden_states=hidden_states, + output=output, + positions=positions, + kv_caches=None) + + +def linear_attention_fake( + hidden_states: torch.Tensor, + output: torch.Tensor, + positions: torch.Tensor, + layer_name: str, +) -> None: + return + + 
+direct_register_custom_op( + op_name="linear_attention", + op_func=linear_attention, + mutates_args=["output"], + fake_impl=linear_attention_fake, + dispatch_key=current_platform.dispatch_key, +) From 4e4d017b6f70c729e7c78f74e4328a4ebca7b8ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= <ohg3417@gmail.com> Date: Thu, 28 Aug 2025 02:17:29 +0900 Subject: [PATCH 100/112] [Docs] Fix warnings in `mkdocs build` (continued) (#23743) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Zerohertz <ohg3417@gmail.com> Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com> --- vllm/core/block/naive_block.py | 2 +- vllm/core/block/prefix_caching_block.py | 2 +- vllm/core/scheduler.py | 2 +- vllm/v1/attention/backends/cpu_attn.py | 3 ++- vllm/v1/attention/backends/flash_attn.py | 3 ++- vllm/v1/attention/backends/flashinfer.py | 8 +++----- vllm/v1/attention/backends/flex_attention.py | 3 ++- vllm/v1/attention/backends/pallas.py | 5 +++-- vllm/v1/attention/backends/rocm_aiter_fa.py | 3 ++- vllm/v1/attention/backends/tree_attn.py | 3 ++- vllm/v1/attention/backends/triton_attn.py | 3 ++- vllm/v1/attention/backends/xformers.py | 3 ++- vllm/v1/core/encoder_cache_manager.py | 8 ++++---- vllm/v1/core/kv_cache_coordinator.py | 3 ++- vllm/v1/core/kv_cache_manager.py | 11 ++++++----- vllm/v1/executor/ray_distributed_executor.py | 3 ++- vllm/v1/metrics/prometheus.py | 2 +- vllm/v1/sample/logits_processor/interface.py | 4 ++-- vllm/v1/sample/rejection_sampler.py | 2 +- vllm/v1/sample/tpu/sampler.py | 2 +- vllm/v1/structured_output/backend_types.py | 4 ++-- vllm/v1/worker/gpu_input_batch.py | 3 --- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 10 +++++----- vllm/v1/worker/utils.py | 8 ++++---- vllm/v1/worker/worker_base.py | 4 ++-- 26 files changed, 56 insertions(+), 50 deletions(-) diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 
dae6ead04e9c9..7d9b32cd4b674 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -207,7 +207,7 @@ class NaiveBlockAllocator(BlockAllocator): Args: absolute_id (int): The absolute block id for the block - in whole allocator. + in whole allocator. Returns: int: The zero-offset block id on certain device. diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 2913a01bf34a5..a21d69323abbc 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -61,7 +61,7 @@ class PrefixCachingBlockAllocator(BlockAllocator): Args: num_blocks (int): The total number of blocks to manage. block_size (int): The size of each block in tokens. - block_ids(Optional[Iterable[int]], optional): An optional iterable of + block_ids (Optional[Iterable[int]], optional): An optional iterable of block IDs. If not provided, block IDs will be assigned sequentially from 0 to num_blocks - 1. """ diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 63894e7f5dc8b..c89f3f6632642 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -657,7 +657,7 @@ class Scheduler: `budget.num_batched_tokens` has not enough capacity to schedule all tokens. partial_prefill_metadata: information about the partial prefills - that are currently running + that are currently running Returns: SchedulerRunningOutputs. 
diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index 973979fdf7dfd..ced8234a7b433 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -491,7 +491,8 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): query: shape = [num_tokens, num_heads * head_size] key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + kv_cache: shape = + [2, num_blocks, block_size * num_kv_heads * head_size] NOTE: kv_cache will be an empty tensor with shape [0] for profiling run. attn_metadata: Metadata for attention. diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 6e7096de924ca..dd2b956d4fa3d 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -438,7 +438,8 @@ class FlashAttentionImpl(AttentionImpl): query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. 
Returns: shape = [num_tokens, num_heads * head_size] diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 1115fc606b055..70d3471a47259 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -637,11 +637,9 @@ class FlashInferImpl(AttentionImpl): query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache: shape - - # NHD: [num_blocks, 2, block_size, num_kv_heads, head_size] - # HND: [num_blocks, 2, num_kv_heads, block_size, head_size] - - + kv_cache: KV cache tensor with different possible shapes: + - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size] + - HND: [num_blocks, 2, num_kv_heads, block_size, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index 458562ebc8d27..a596f6b2b32a4 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -689,7 +689,8 @@ class FlexAttentionImpl(AttentionImpl): query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. 
Returns: shape = [num_tokens, num_heads * head_size] diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py index fd97db0abb84f..26f9abf13d0ed 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -235,7 +235,8 @@ class PallasAttentionBackendImpl(AttentionImpl): query: shape = [num_tokens, num_heads * head_size] key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size] + kv_cache: shape = + [num_blocks, block_size, num_kv_heads * 2, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] @@ -329,7 +330,7 @@ def write_to_kv_cache( Args: key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size] + kv_cache: shape = [num_blocks, block_size, num_kv_heads * 2, head_size] num_slices_per_kv_cache_update_block: int """ _, page_size, num_combined_kv_heads, head_size = kv_cache.shape diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 403ad8e88a958..173a0a255e491 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -429,7 +429,8 @@ class AiterFlashAttentionImpl(AttentionImpl): query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. 
Returns: shape = [num_tokens, num_heads * head_size] diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index c93223a340839..b96d957a150b5 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -362,7 +362,8 @@ class TreeAttentionImpl(AttentionImpl): query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index b12036c599799..a37a7f6811ef9 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -285,7 +285,8 @@ class TritonAttentionImpl(AttentionImpl): query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. 
Returns: shape = [num_tokens, num_heads * head_size] diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py index e0eb7d8be9746..7f888c1135743 100644 --- a/vllm/v1/attention/backends/xformers.py +++ b/vllm/v1/attention/backends/xformers.py @@ -330,7 +330,8 @@ class XFormersAttentionImpl(AttentionImpl): query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index c9d18033a1988..bd2ec036834b2 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -255,9 +255,9 @@ def compute_encoder_budget( Returns: - Compute budget for encoder execution, measured in number of tokens - from the input sequence. + from the input sequence. - Space budget for encoder cache size, measured in number of tokens - from the input sequence. + from the input sequence. """ if mm_registry.supports_multimodal_inputs(model_config): max_tokens_by_modality = mm_registry \ @@ -303,9 +303,9 @@ def compute_mm_encoder_budget( Returns: - Compute budget for encoder execution, measured in number of tokens - from the input sequence. + from the input sequence. - Space budget for encoder cache size, measured in number of tokens - from the input sequence. + from the input sequence. """ if not max_tokens_by_modality: diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index f082ad00f2e35..9421341f990c8 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -119,7 +119,8 @@ class KVCacheCoordinator(ABC): Args: request: The request. 
- num_tokens: The total number of tokens that need to be cached + num_computed_tokens: The total number of tokens + that need to be cached (including tokens that are already cached). """ for manager in self.single_type_managers: diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index b427a9c497fef..87a11fe58a048 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -54,14 +54,15 @@ class KVCacheBlocks: def get_block_ids( self, allow_none: bool = False, - ): + ) -> Optional[tuple[list[int], ...]]: """ Converts the KVCacheBlocks instance to block_ids. - + Returns: - tuple[list[int], ...]: A tuple of lists where - * the outer tuple corresponds to KV cache groups - * each inner list contains the block_ids of the blocks in that group + tuple[list[int], ...]: A tuple of lists where: + - the outer tuple corresponds to KV cache groups + - each inner list contains the block_ids of the blocks in that + group """ if allow_none and all(len(group) == 0 for group in self.blocks): return None diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py index c05ad1966d611..8394ae788ab01 100644 --- a/vllm/v1/executor/ray_distributed_executor.py +++ b/vllm/v1/executor/ray_distributed_executor.py @@ -8,6 +8,7 @@ from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.executor.ray_distributed_executor import ( # noqa RayDistributedExecutor as RayDistributedExecutorV0) from vllm.logger import init_logger +from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.executor.abstract import Executor from vllm.v1.outputs import ModelRunnerOutput @@ -64,7 +65,7 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor): def execute_model( self, - scheduler_output, + scheduler_output: SchedulerOutput, ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]: 
"""Execute the model on the Ray workers. diff --git a/vllm/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py index 61ba5d66cb31a..a43cf9ce255e6 100644 --- a/vllm/v1/metrics/prometheus.py +++ b/vllm/v1/metrics/prometheus.py @@ -36,7 +36,7 @@ def setup_multiprocess_prometheus(): "and vLLM will properly handle cleanup.") -def get_prometheus_registry(): +def get_prometheus_registry() -> CollectorRegistry: """Get the appropriate prometheus registry based on multiprocessing configuration. diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py index 16cd00943db8d..683fc7c00dfb2 100644 --- a/vllm/v1/sample/logits_processor/interface.py +++ b/vllm/v1/sample/logits_processor/interface.py @@ -91,7 +91,7 @@ class LogitsProcessor(ABC): to each forward pass. Args: - batch_update is non-None iff there have been - changes to the batch makeup. + batch_update: Non-None iff there have been changes + to the batch makeup. """ raise NotImplementedError diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index b2354c53302ad..2d9ce3101b6c9 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -68,7 +68,7 @@ class RejectionSampler(nn.Module): different requests are flattened into a single tensor because this is the shape of the output logits. NOTE: `target_logits` can be updated in place to save memory. - bonus_token_ids_tensor (torch.Tensor): + bonus_token_ids (torch.Tensor): A tensor containing bonus tokens. Shape is [batch_size, 1]. Bonus tokens are added to the end of the sequence if all proposed tokens are accepted. We generate the bonus tokens diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py index 04545d587e4a9..e84136e3a6d07 100644 --- a/vllm/v1/sample/tpu/sampler.py +++ b/vllm/v1/sample/tpu/sampler.py @@ -89,7 +89,7 @@ class Sampler(nn.Module): Gather logprobs for topk and sampled/prompt token. 
Args: - logits: (num tokens) x (vocab) tensor + logprobs: (num tokens) x (vocab) tensor num_logprobs: minimum number of logprobs to retain per token token_ids: prompt tokens (if prompt logprobs) diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py index d500783aa4b30..9a53aa7a1ad10 100644 --- a/vllm/v1/structured_output/backend_types.py +++ b/vllm/v1/structured_output/backend_types.py @@ -110,7 +110,7 @@ class StructuredOutputBackend(ABC): Args: request_type (StructuredOutputOptions): The type of structured - output request. + output request. grammar_spec (str): The grammar specification to compile. Returns: @@ -124,7 +124,7 @@ class StructuredOutputBackend(ABC): Args: max_num_seqs (int): The maximum number of sequences for which - to allocate the bitmask. + to allocate the bitmask. """ @abstractmethod diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 284af6bfedce0..f4c2f45df5954 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -525,9 +525,6 @@ class InputBatch: Any consecutive empty indices at the very end of the list are not filled. - Args: - empty_req_indices: empty indices which may be filled. - Returns: swaps: list of (from,to) swap tuples for moved requests empty_req_indices: indices not filled by condensation diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 20d2d20ba0967..01c90b2ea38d3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2955,7 +2955,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): Args: kv_cache_config: The KV cache config kv_cache_raw_tensors: The KV cache buffer of each layer, with - correct size but uninitialized shape. + correct size but uninitialized shape. Returns: Dict[str, torch.Tensor]: A map between layer names to their corresponding memory buffer for KV cache. 
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index d364236604274..70ffde39ca333 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -552,7 +552,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return kv_cache_spec def _get_slot_mapping_metadata(self, num_reqs, - num_scheduled_tokens_per_req): + num_scheduled_tokens_per_req) -> np.ndarray: """ Computes metadata for mapping slots to blocks in the key-value (KV) cache for a batch of requests. @@ -565,15 +565,15 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): Args: num_reqs (int): Number of requests in the current batch. num_scheduled_tokens_per_req (int or np.ndarray): Number of tokens - to be scheduled for each request. + to be scheduled for each request. Returns: np.ndarray: A 2D array of shape (total_block_len, 3), where each row - contains: + contains: - kv_cache_start_index (int): The starting index in the KV cache - for the corresponding slice. + for the corresponding slice. - new_kv_start_index (int): The starting index in the new KV - cache for the corresponding slice. + cache for the corresponding slice. - slice_len (int): The length of the slice. """ slices_start = self.input_batch.num_computed_tokens_cpu[:num_reqs] diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index f407534687662..a519336e41616 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -172,10 +172,10 @@ def scatter_mm_placeholders( Args: embeds: The multimodal embeddings. - Shape: `(num_embeds, embed_dim)` + Shape: `(num_embeds, embed_dim)` is_embed: A boolean mask indicating which positions in the placeholder - tokens need to be filled with multimodal embeddings. - Shape: `(num_placeholders, num_embeds)` + tokens need to be filled with multimodal embeddings. 
+ Shape: `(num_placeholders, num_embeds)` """ if is_embed is None: return embeds @@ -278,7 +278,7 @@ def bind_kv_cache( Args: kv_caches: The allocated kv_caches with layer names as keys. forward_context: The global forward context containing all Attention - layers with layer names as keys. + layers with layer names as keys. runner_kv_caches: The kv_cache declared by ModelRunner. """ # Bind kv_caches to ModelRunner diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index 9c93754f93f81..038ce4b54f960 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -36,8 +36,8 @@ class WorkerBase(WorkerBaseV0): local_rank: Local device index rank: Global rank in distributed setup distributed_init_method: Distributed initialization method - is_driver_worker: Whether this worker handles driver - responsibilities + is_driver_worker: Whether this worker handles driver + responsibilities """ # Configuration storage super().__init__(vllm_config=vllm_config) From 3c0ef769bace3d48b276c7233ed6f39fe03f95b7 Mon Sep 17 00:00:00 2001 From: Eli Uriegas <1700823+seemethere@users.noreply.github.com> Date: Wed, 27 Aug 2025 10:41:48 -0700 Subject: [PATCH 101/112] ci: Add arm64 docker build to release pipeline (#23210) Signed-off-by: Eli Uriegas <eliuriegas@meta.com> Signed-off-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com> --- .buildkite/release-pipeline.yaml | 38 +++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index f96c38bf57db7..86aae426c258c 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -7,7 +7,7 @@ steps: commands: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 
--build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" @@ -62,23 +62,49 @@ steps: env: DOCKER_BUILDKIT: "1" - - block: "Build release image" + - block: "Build release image (x86)" depends_on: ~ key: block-release-image-build - - label: "Build release image" + - label: "Build release image (x86)" depends_on: block-release-image-build - id: build-release-image + id: build-release-image-x86 agents: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." 
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" + # re-tag to default image tag and push, just in case arm64 build fails + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + - label: "Build release image (arm64)" + depends_on: block-release-image-build + id: build-release-image-arm64 + agents: + queue: arm64_cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." 
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" + + # Add job to create multi-arch manifest + - label: "Create multi-arch manifest" + depends_on: + - build-release-image-x86 + - build-release-image-arm64 + id: create-multi-arch-manifest + agents: + queue: cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend" + - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + - label: "Annotate release workflow" depends_on: - - build-release-image + - create-multi-arch-manifest - build-wheel-cuda-12-8 - build-wheel-cuda-12-6 - build-wheel-cuda-11-8 From 0585a9e73c072a8cbb1a64bea3c26dd0d2dde402 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 27 Aug 2025 20:03:05 +0100 Subject: [PATCH 102/112] Disable `torch.compile` for dynamic rope models in Transformers backend (#23738) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 25 +++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index fc242d1adafd0..dffc347a73668 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -88,6 +88,23 @@ def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module): logger.debug("%s: %s -> %s", name, old_module, new_module) +def can_enable_torch_compile(vllm_config: VllmConfig) -> bool: + """ + Callable to be passed to `@support_torch_compile`'s `enable_if` argument. 
+ + Defaults to `True` but is disabled in the following situations: + + - The model uses dynamic rope scaling. + """ + enable = True + text_config = vllm_config.model_config.hf_config.get_text_config() + # Dynamic rope scaling is not compatible with torch.compile + rope_scaling: dict = getattr(text_config, "rope_scaling", None) or {} + if rope_scaling.get("rope_type") == "dynamic": + enable = False + return enable + + def replace_linear_class( linear: nn.Linear, style: Literal["colwise", "rowwise"], quant_config: QuantizationConfig @@ -641,7 +658,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) -@support_torch_compile +@support_torch_compile(enable_if=can_enable_torch_compile) class TransformersModel(TransformersBase): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ @@ -653,7 +670,7 @@ class TransformersModel(TransformersBase): }) -@support_torch_compile +@support_torch_compile(enable_if=can_enable_torch_compile) class TransformersForCausalLM(TransformersBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -709,12 +726,14 @@ def flatten_and_concat(x: list[torch.Tensor]) -> torch.Tensor: info=MultiModalProcessingInfo, dummy_inputs=MultiModalDummyInputsBuilder) @support_torch_compile( + # set `positions` to last dim to support Qwen-mrope dynamic_arg_dims={ "input_ids": 0, "positions": -1, "intermediate_tensors": 0, "inputs_embeds": 0, - }) # set `positions` to last dim to support Qwen-mrope + }, + enable_if=can_enable_torch_compile) class TransformersForMultimodalLM(TransformersForCausalLM, SupportsMultiModal): # Backwards compatibility for prev released models. 
State dicts back then # had different formats and cannot be loaded with `AutoModel` mapping as is From 8bf6266a17933b130f94f6d53f32ac029ed8ba1b Mon Sep 17 00:00:00 2001 From: Roger Wang <hey@rogerw.io> Date: Wed, 27 Aug 2025 13:24:31 -0700 Subject: [PATCH 103/112] [Multimodal] Generate mm_hash based on request metadata when caching is turned off (#23690) Signed-off-by: Roger Wang <hey@rogerw.io> --- vllm/inputs/preprocess.py | 71 ++++++++++++++++--- vllm/model_executor/models/deepseek_vl2.py | 3 + vllm/model_executor/models/h2ovl.py | 3 + vllm/model_executor/models/llava.py | 8 ++- vllm/model_executor/models/mllama.py | 8 ++- vllm/model_executor/models/paligemma.py | 8 ++- vllm/model_executor/models/pixtral.py | 2 + .../models/prithvi_geospatial_mae.py | 7 +- vllm/model_executor/models/transformers.py | 7 +- vllm/model_executor/models/voxtral.py | 2 + vllm/multimodal/processing.py | 36 ++++++++-- vllm/v1/engine/processor.py | 48 +++++++++++++ 12 files changed, 179 insertions(+), 24 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index f0d0cab3df3d9..fff9c42fe36fe 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -257,6 +257,8 @@ class InputPreprocessor: mm_processor_kwargs: Optional[Mapping[str, object]], tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalInputs: """ Apply the model's multi-modal processor to a multi-modal prompt, @@ -273,10 +275,13 @@ class InputPreprocessor: if mm_processor_kwargs is None: mm_processor_kwargs = {} - return mm_processor.apply(prompt, - mm_data, - hf_processor_mm_kwargs=mm_processor_kwargs, - tokenization_kwargs=tokenization_kwargs) + return mm_processor.apply( + prompt, + mm_data, + hf_processor_mm_kwargs=mm_processor_kwargs, + tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, + ) async def _process_multimodal_async( 
self, @@ -285,6 +290,8 @@ class InputPreprocessor: mm_processor_kwargs: Optional[Mapping[str, object]], tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalInputs: """ Async version of @@ -301,10 +308,13 @@ class InputPreprocessor: if mm_processor_kwargs is None: mm_processor_kwargs = {} - return mm_processor.apply(prompt, - mm_data, - hf_processor_mm_kwargs=mm_processor_kwargs, - tokenization_kwargs=tokenization_kwargs) + return mm_processor.apply( + prompt, + mm_data, + hf_processor_mm_kwargs=mm_processor_kwargs, + tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, + ) def _process_embeds( self, @@ -341,6 +351,8 @@ class InputPreprocessor: parsed_content: TokensPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_token_ids = parsed_content["prompt_token_ids"] token_type_ids = parsed_content.get("token_type_ids") @@ -353,6 +365,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) else: inputs = token_inputs( @@ -370,6 +383,8 @@ class InputPreprocessor: parsed_content: TokensPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_token_ids = parsed_content["prompt_token_ids"] token_type_ids = parsed_content.get("token_type_ids") @@ -382,6 +397,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) else: inputs = token_inputs( @@ -399,6 
+415,8 @@ class InputPreprocessor: parsed_content: TextPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_text = parsed_content["prompt"] @@ -410,6 +428,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) else: prompt_token_ids = self._tokenize_prompt( @@ -432,6 +451,8 @@ class InputPreprocessor: parsed_content: TextPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_text = parsed_content["prompt"] @@ -443,6 +464,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) else: prompt_token_ids = await self._tokenize_prompt_async( @@ -465,6 +487,8 @@ class InputPreprocessor: prompt: SingletonPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> SingletonInputs: """ Extract the singleton inputs from a prompt. 
@@ -486,18 +510,21 @@ class InputPreprocessor: return self._process_tokens( parsed["content"], lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) if parsed["type"] == "text": return self._process_text( parsed["content"], tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) if parsed["type"] == "str": return self._process_text( TextPrompt(prompt=parsed["content"]), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) assert_never(parsed) @@ -507,6 +534,8 @@ class InputPreprocessor: prompt: SingletonPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> SingletonInputs: """ Async version of @@ -520,18 +549,21 @@ class InputPreprocessor: return await self._process_tokens_async( parsed["content"], lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) if parsed["type"] == "text": return await self._process_text_async( parsed["content"], tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) if parsed["type"] == "str": return await self._process_text_async( TextPrompt(prompt=parsed["content"]), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) assert_never(parsed) @@ -641,6 +673,8 @@ class InputPreprocessor: self, prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> EncoderDecoderInputs: """ For encoder/decoder models only: @@ -682,6 +716,7 @@ class InputPreprocessor: encoder_inputs = self._prompt_to_llm_inputs( prompt["encoder_prompt"], tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) if (decoder_input := prompt["decoder_prompt"]) is None: decoder_inputs = None @@ -697,6 +732,7 
@@ class InputPreprocessor: inputs = self._prompt_to_llm_inputs( prompt, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) if self.model_config.is_multimodal_model: # Encoder-Decoder Multimodal model @@ -712,6 +748,8 @@ class InputPreprocessor: self, prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> EncoderDecoderInputs: """ Async version of @@ -724,6 +762,7 @@ class InputPreprocessor: encoder_task = self._prompt_to_llm_inputs_async( prompt["encoder_prompt"], tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) if (decoder_input := prompt["decoder_prompt"]) is None: @@ -733,6 +772,7 @@ class InputPreprocessor: decoder_task = self._prompt_to_llm_inputs_async( decoder_input, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) encoder_inputs, decoder_inputs = await asyncio.gather( @@ -748,6 +788,7 @@ class InputPreprocessor: inputs = await self._prompt_to_llm_inputs_async( prompt, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) if self.model_config.is_multimodal_model: # Encoder-Decoder Multimodal model @@ -774,6 +815,8 @@ class InputPreprocessor: prompt: SingletonPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> DecoderOnlyInputs: """ For decoder-only models: @@ -794,6 +837,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) return self._build_decoder_only_llm_inputs(prompt_comps) @@ -803,6 +847,8 @@ class InputPreprocessor: prompt: SingletonPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> DecoderOnlyInputs: 
""" Async version of @@ -812,6 +858,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) return self._build_decoder_only_llm_inputs(prompt_comps) @@ -821,6 +868,8 @@ class InputPreprocessor: prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> ProcessorInputs: """Preprocess the input prompt.""" if self.model_config.is_encoder_decoder: @@ -829,6 +878,7 @@ class InputPreprocessor: return self._process_encoder_decoder_prompt( prompt, tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) if is_explicit_encoder_decoder_prompt(prompt): @@ -840,6 +890,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) async def preprocess_async( @@ -847,6 +898,8 @@ class InputPreprocessor: prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> ProcessorInputs: """ Async version of @@ -858,6 +911,7 @@ class InputPreprocessor: return await self._process_encoder_decoder_prompt_async( prompt, tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) if is_explicit_encoder_decoder_prompt(prompt): @@ -869,6 +923,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) def clear_cache(self) -> None: diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index ceb5e1364b68d..1bd2802a86838 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -290,6 +290,7 @@ class DeepseekVL2MultiModalProcessor( mm_data_items: MultiModalDataItems, 
hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: # The processor logic is different for len(images) <= 2 vs > 2 # Since the processing cache assumes that the processor output is @@ -301,6 +302,7 @@ class DeepseekVL2MultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) return super()._cached_apply_hf_processor( @@ -308,6 +310,7 @@ class DeepseekVL2MultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 87e451a2769ea..306775af68065 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -479,6 +479,7 @@ class H2OVLMultiModalProcessor( mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: # The processor logic is different for len(images) <= 1 vs > 1 # Since the processing cache assumes that the processor output is @@ -490,6 +491,7 @@ class H2OVLMultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) return super()._cached_apply_hf_processor( @@ -497,6 +499,7 @@ class H2OVLMultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 
0ee26b68345c3..8a847a6180f3a 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -795,6 +795,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalInputs: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index @@ -805,8 +806,11 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): image_height=-1, ) - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs, - tokenization_kwargs) + result = super().apply(prompt, + mm_data, + hf_processor_mm_kwargs, + tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides) mm_items = self._to_mm_items(mm_data) mm_item_counts = mm_items.get_all_counts() diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 2a60450de4141..cc2216996f032 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -184,9 +184,13 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo] mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalEncDecInputs: - mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs, - tokenization_kwargs) + mm_inputs = super().apply(prompt, + mm_data, + hf_processor_mm_kwargs, + tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides) image_token_id = self.info.get_hf_config().image_token_index # Check that the number of image tokens in the decoder prompt matches diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 95abb190e0a46..b74a09ee92c33 100644 --- a/vllm/model_executor/models/paligemma.py +++ 
b/vllm/model_executor/models/paligemma.py @@ -203,9 +203,13 @@ class PaliGemmaMultiModalProcessor( mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalInputs: - mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs, - tokenization_kwargs) + mm_inputs = super().apply(prompt, + mm_data, + hf_processor_mm_kwargs, + tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides) prompt_token_ids = mm_inputs["prompt_token_ids"] tokenizer = self.info.get_tokenizer() diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 461b9c85d1c22..a74e01a59697e 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -314,12 +314,14 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo] mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: prompt_ids, mm_info, _ = super()._cached_apply_hf_processor( prompt=prompt, mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) # NOTE: The tokens are already inserted by the chat template diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index 2d14fe6d5892f..2edc357d2df1b 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -138,6 +138,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + 
mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalInputs: if "image" in mm_data: image_data = mm_data["image"] @@ -146,8 +147,10 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): mm_data = {"image": mm_data} mm_items = self._to_mm_items(mm_data) - mm_hashes = self._hash_mm_items(mm_items, hf_processor_mm_kwargs, - tokenization_kwargs or {}) + tokenization_kwargs = tokenization_kwargs or {} + mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else + self._hash_mm_items(mm_items, hf_processor_mm_kwargs, + tokenization_kwargs)) mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]} mm_processed_data = BatchFeature(image_data) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index dffc347a73668..edf3dddb1bad2 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -327,6 +327,7 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -393,9 +394,11 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs, num_image_patches), ) + # Use overrides if provided; fallback to data-dependent hashing. 
+ mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else + self._hash_mm_items(mm_items, hf_processor_mm_kwargs, + tokenization_kwargs)) - mm_hashes = self._hash_mm_items(mm_items, hf_processor_mm_kwargs, - tokenization_kwargs) return MultiModalInputs( type="multimodal", prompt=prompt, diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 77f11a691e080..eed8d89ca4f5a 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -288,12 +288,14 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo] mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: prompt_ids, mm_info, _ = super()._cached_apply_hf_processor( prompt=prompt, mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) # NOTE: The tokens are already inserted by the chat template diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 6ecdf80d4aa6f..41595df2e2624 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1020,8 +1020,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): prompt: str, mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], + *, + mm_hash_overrides: Optional[MultiModalHashes] = None, ) -> MultiModalInputs: - return self.apply(prompt, mm_data, hf_processor_mm_kwargs) + return self.apply(prompt, + mm_data, + hf_processor_mm_kwargs, + mm_hash_overrides=mm_hash_overrides) def _get_data_parser(self) -> MultiModalDataParser: """ @@ -1357,7 +1362,11 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], ) -> MultiModalHashes: - 
"""Create MM hashes to be returned (only used in V1).""" + """Create MM hashes to be returned (only used in V1). + + Note: When overrides are provided via callers of `apply`, + `_hash_mm_items` will be bypassed and the overrides will be used. + """ model_id = self.info.model_id return { @@ -1464,6 +1473,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], + *, + mm_hash_overrides: Optional[MultiModalHashes] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: ( prompt_ids, @@ -1483,8 +1494,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs), ) - mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, - tokenization_kwargs) + # Use overrides if provided; fallback to data-dependent hashing. + mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else + self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, + tokenization_kwargs)) mm_prompt_updates = self._get_mm_prompt_updates( mm_data_items, @@ -1506,6 +1519,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], + *, + mm_hash_overrides: Optional[MultiModalHashes] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: """ Apply the HF processor on the full prompt text, @@ -1520,10 +1535,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) - mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, - tokenization_kwargs) + # Use overrides if provided; fallback to data-dependent hashing. 
+ mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else + self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, + tokenization_kwargs)) mm_missing_data_items = self._get_cache_missing_items( cache=cache, @@ -1723,6 +1741,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -1751,6 +1771,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_items, hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) # NOTE: tokenization_kwargs are not required to init processor @@ -1835,6 +1856,8 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + *, + mm_hash_overrides: Optional[MultiModalHashes] = None, ) -> MultiModalEncDecInputs: """ Process multi-modal inputs to be used in vLLM. @@ -1849,6 +1872,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): mm_data, hf_processor_mm_kwargs, tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) return self._get_enc_dec_inputs( diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 7ed60156626bf..df915258d8637 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -225,6 +225,41 @@ class Processor: # Remember that this backend was set automatically params.guided_decoding.backend_was_auto = True + def _maybe_build_mm_hash_overrides( + self, + request_id: str, + prompt: PromptType, + ) -> Optional[dict[str, list[str]]]: + """Build per-item multimodal hash overrides when enabled. 
In this case, + multimodal data items are identified by their request id, modality and + index rather than their content. + + Returns a dictionary of modality -> list[str] of overrides, or None if + disabled or no multimodal data is present. + """ + + def _extract_mm_data(p: PromptType): + if isinstance(p, dict) and "encoder_prompt" in p: + enc = p.get("encoder_prompt") + if isinstance(enc, dict): + return enc.get("multi_modal_data") + return None + if isinstance(p, dict): + return p.get("multi_modal_data") + return None + + mm_data = _extract_mm_data(prompt) + if not mm_data: + return None + + overrides: dict[str, list[str]] = {} + for modality, data in mm_data.items(): + n = len(data) if isinstance(data, list) else 1 + overrides[modality] = [ + f"{request_id}-{modality}-{i}" for i in range(n) + ] + return overrides + def process_inputs( self, request_id: str, @@ -254,6 +289,18 @@ class Processor: if arrival_time is None: arrival_time = time.time() + # Optionally generate multimodal hash overrides based on request id. + # NOTE: when users explicitly turn off BOTH prefix caching and input + # processing caching, no multimodal features or embeddings will be + # reused across requests, therefore hashing is no longer necessary. + if (self.model_config.multimodal_config and + self.model_config.multimodal_config.mm_processor_cache_gb == 0 + and not self.cache_config.enable_prefix_caching): + mm_hash_overrides = self._maybe_build_mm_hash_overrides( + request_id, prompt) + else: + mm_hash_overrides = None + # Process inputs, which includes: # 1. Tokenize text prompt, with LoRA request if one exists. # 2. 
For multimodal models with a merged preprocessor, preprocess @@ -262,6 +309,7 @@ class Processor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) from vllm.platforms import current_platform current_platform.validate_request( From 853c371fc33e7c99aa2ab9f6e2cd7cbd1cadcf99 Mon Sep 17 00:00:00 2001 From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Date: Wed, 27 Aug 2025 23:53:30 +0300 Subject: [PATCH 104/112] [V1][Mamba] - Enable V1 by default for Mamba Models (#23650) Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com> --- .../models/language/generation/test_hybrid.py | 147 ++++++++---------- vllm/engine/arg_utils.py | 5 - vllm/model_executor/models/config.py | 1 + 3 files changed, 70 insertions(+), 83 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 7e7cc893ec8aa..31ca3a6f0f985 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -100,21 +100,19 @@ def test_models( else: hf_outputs = None - if model not in V0_UNSUPPORTED_MODELS: - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - vllm_v0_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - else: - vllm_v0_outputs = None + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + if model not in V0_UNSUPPORTED_MODELS: + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + vllm_v0_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + else: + vllm_v0_outputs = None if model in V1_SUPPORTED_MODELS: - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - with vllm_runner(model, - max_num_seqs=MAX_NUM_SEQS, - enable_prefix_caching=False) as vllm_model: - vllm_v1_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, 
num_logprobs) + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + vllm_v1_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) else: vllm_v1_outputs = None @@ -137,7 +135,7 @@ def test_models( ) -@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS) +@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) def test_batching( @@ -147,10 +145,6 @@ def test_batching( max_tokens: int, num_logprobs: int, ) -> None: - if model in V0_UNSUPPORTED_MODELS: - pytest.skip( - f"Unsupported V0 Engine. Skipping `test_batching` on {model}.") - try: model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") @@ -188,29 +182,32 @@ def test_chunked_prefill( max_tokens: int, num_logprobs: int, chunked_prefill_token_size: int, + monkeypatch, ) -> None: max_num_seqs = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size - with vllm_runner(model, - enable_chunked_prefill=True, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs) as vllm_model: - chunked = vllm_model.generate_greedy_logprobs(example_prompts, - max_tokens, num_logprobs) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + with vllm_runner(model, + enable_chunked_prefill=True, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs) as vllm_model: + chunked = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) - with vllm_runner(model, - enable_chunked_prefill=False, - max_num_seqs=max_num_seqs) as vllm_model: - non_chunked = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + with vllm_runner(model, + enable_chunked_prefill=False, + max_num_seqs=max_num_seqs) as vllm_model: + non_chunked = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) - 
check_logprobs_close( - outputs_0_lst=chunked, - outputs_1_lst=non_chunked, - name_0="chunked", - name_1="non_chunked", - ) + check_logprobs_close( + outputs_0_lst=chunked, + outputs_1_lst=non_chunked, + name_0="chunked", + name_1="non_chunked", + ) @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @@ -281,25 +278,29 @@ def test_models_preemption_recompute( example_prompts, model: str, max_tokens: int, + monkeypatch, ) -> None: """ Tests that outputs are identical with and w/o preemptions (recompute). """ - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - scheduler = vllm_model.llm.llm_engine.scheduler[0] - scheduler.ENABLE_ARTIFICIAL_PREEMPT = True - preempt_vllm_outputs = vllm_model.generate_greedy( - example_prompts, max_tokens) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + scheduler = vllm_model.llm.llm_engine.scheduler[0] + scheduler.ENABLE_ARTIFICIAL_PREEMPT = True + preempt_vllm_outputs = vllm_model.generate_greedy( + example_prompts, max_tokens) - scheduler.ENABLE_ARTIFICIAL_PREEMPT = False - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + scheduler.ENABLE_ARTIFICIAL_PREEMPT = False + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) - check_outputs_equal( - outputs_0_lst=preempt_vllm_outputs, - outputs_1_lst=vllm_outputs, - name_0="vllm_preepmtions", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=preempt_vllm_outputs, + outputs_1_lst=vllm_outputs, + name_0="vllm_preepmtions", + name_1="vllm", + ) @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @@ -402,24 +403,18 @@ def test_full_cuda_graph( else: hf_outputs = None - if model not in V0_UNSUPPORTED_MODELS: - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - vllm_v0_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - else: - vllm_v0_outputs = 
None - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - if model in HYBRID_MODELS: - # required due to reorder_batch behaviour - m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") - with vllm_runner(model, - max_num_seqs=MAX_NUM_SEQS, - compilation_config={'full_cuda_graph': True}, - enable_prefix_caching=False) as vllm_model: - vllm_v1_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + m.setenv("VLLM_USE_V1", "0") + if model not in V0_UNSUPPORTED_MODELS: + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + vllm_v0_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + else: + vllm_v0_outputs = None + + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + vllm_v1_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) if hf_outputs is not None and vllm_v0_outputs is not None: check_logprobs_close( @@ -466,24 +461,20 @@ def test_fp32_state( else: hf_outputs = None + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + with vllm_runner(model, + max_num_seqs=MAX_NUM_SEQS, + mamba_ssm_cache_dtype="float32") as vllm_model: + vllm_v0_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, mamba_ssm_cache_dtype="float32") as vllm_model: - vllm_v0_outputs = vllm_model.generate_greedy_logprobs( + vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - if model in HYBRID_MODELS: - # required due to reorder_batch behaviour - m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") - with vllm_runner(model, - max_num_seqs=MAX_NUM_SEQS, - mamba_ssm_cache_dtype="float32", - enable_prefix_caching=False) as vllm_model: - vllm_v1_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) 
- if hf_outputs is not None: check_logprobs_close( outputs_0_lst=hf_outputs, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3399d505e3631..e4d205aeb8633 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1463,11 +1463,6 @@ class EngineArgs: recommend_to_remove=False) return False - # V1 mamba models are unoptimized. - if model_config.has_inner_state and _warn_or_fallback( - feature_name="Mamba"): - return False - # No Concurrent Partial Prefills so far. if (self.max_num_partial_prefills != SchedulerConfig.max_num_partial_prefills diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 88b3154de2cbb..b0dbfacece3ab 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -417,4 +417,5 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "GptOssForCausalLM": GptOssForCausalLMConfig, "MambaForCausalLM": MambaModelConfig, "Mamba2ForCausalLM": MambaModelConfig, + "FalconMambaForCausalLM": MambaModelConfig, } From 082cc07ef8f810bea61eaed77a60137684ca78f8 Mon Sep 17 00:00:00 2001 From: Yongye Zhu <zyy1102000@gmail.com> Date: Wed, 27 Aug 2025 17:33:21 -0400 Subject: [PATCH 105/112] DP/EP Support for gpt-oss with deepep-ht comm kernel on SM100 (#23608) --- .../base_device_communicator.py | 2 +- .../model_executor/layers/fused_moe/config.py | 6 + vllm/model_executor/layers/fused_moe/layer.py | 6 +- .../layers/fused_moe/trtllm_moe.py | 197 ++++++++++++++++++ vllm/model_executor/layers/fused_moe/utils.py | 16 ++ .../compressed_tensors_moe.py | 8 +- .../model_executor/layers/quantization/fp8.py | 1 + .../layers/quantization/modelopt.py | 2 + .../layers/quantization/mxfp4.py | 110 ++++++++++ .../layers/quantization/utils/mxfp4_utils.py | 9 +- .../layers/quantization/utils/mxfp8_utils.py | 20 ++ 11 files changed, 365 insertions(+), 12 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/trtllm_moe.py create mode 100644 
vllm/model_executor/layers/quantization/utils/mxfp8_utils.py diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index 9e5aa4e4c2a89..9131582eef754 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -255,7 +255,7 @@ class DeviceCommunicatorBase: if module.__class__.__name__ == "FusedMoE" ] for module in moe_modules: - module.quant_method.init_prepare_finalize() + module.quant_method.init_prepare_finalize(module) def dispatch( self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 7c1a7b636a9c2..cab610decf901 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -450,6 +450,12 @@ class FusedMoEConfig: if quant_dtype is None and isinstance(quant_config, Fp8Config): quant_dtype = torch.float8_e4m3fn + from vllm.model_executor.layers.quantization.mxfp4 import ( + Mxfp4Config) + if (quant_dtype is None and isinstance(quant_config, Mxfp4Config) + and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8): + quant_dtype = "mxfp8" + from vllm.model_executor.layers.quantization.modelopt import ( ModelOptNvFp4Config) if quant_dtype is None and isinstance(quant_config, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 54406a5a2d87f..b9de03ddd216e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -200,7 +200,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): # Note: init_prepare_finalize should only be called by # prepare_communication_buffer_for_model. 
- def init_prepare_finalize(self): + def init_prepare_finalize(self, layer: torch.nn.Module): assert self.moe is not None prepare_finalize = self.maybe_make_prepare_finalize(self.moe) @@ -211,7 +211,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): assert self.fused_experts is None, \ f"Attempt to override experts for {id(self)}!" self.topk_indices_dtype = prepare_finalize.topk_indices_dtype() - experts = self.select_gemm_impl(prepare_finalize, self.moe) + experts = self.select_gemm_impl(prepare_finalize, self.moe, layer) self.fused_experts = FusedMoEModularKernel( prepare_finalize, experts, @@ -221,6 +221,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): self, prepare_finalize: FusedMoEPrepareAndFinalize, moe: FusedMoEConfig, + layer: torch.nn.Module, ) -> FusedMoEPermuteExpertsUnpermute: # based on the all2all implementation, select the appropriate # gemm implementation @@ -273,6 +274,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): prepare_finalize: FusedMoEPrepareAndFinalize, # TODO(bnell): Remove. Every layer should have an moe config object. 
moe: FusedMoEConfig, + layer: torch.nn.Module, ) -> FusedMoEPermuteExpertsUnpermute: if (prepare_finalize.activation_format == FusedMoEActivationFormat.BatchedExperts): diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py new file mode 100644 index 0000000000000..14dfce4b0e3aa --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py @@ -0,0 +1,197 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceNoOP) +from vllm.utils import next_power_of_2 + + +class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): + + def __init__( + self, + moe: FusedMoEConfig, + gemm1_alpha, + gemm1_beta, + gemm1_clamp_limit, + w13_bias, + w2_bias, + max_capture_size, + ): + super().__init__(moe.quant_config) + self.moe = moe + self.gemm1_alpha = gemm1_alpha + self.gemm1_beta = gemm1_beta + self.gemm1_clamp_limit = gemm1_clamp_limit + self.w13_bias = w13_bias + self.w2_bias = w2_bias + self.max_capture_size = max_capture_size + + @property + def activation_formats( + self + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return (mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard) + + def supports_chunking(self) -> bool: + return True + + def supports_expert_map(self) -> bool: + return True + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + return TopKWeightAndReduceNoOP() + + def workspace_shapes( + self, + a: torch.Tensor, + aq: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + ) -> 
tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: + # The workspaces for this implementation are managed by flashinfer. + # TODO(varun) : workspace1 is could be used as the output tensor. This + # is error-prone. Allow the `workspace_shapes` to return None workspaces + workspace1 = (M, K) + workspace2 = (0, 0) + output = (M, K) + return (workspace1, workspace2, output, a.dtype) + + def _get_tile_tokens_dim(self, x: torch.Tensor, top_k: int, + local_num_experts: int): + # Number of tokens in the input tensor. + num_tokens = x.shape[0] + # Factor to account for the imbalance of the experts. + # factor equals to the + # max_real_num_tokens_per_expert / perfect_num_tokens_per_expert + # 1.0 means perfect expert distribution. + # > 1.0 means some experts have more tokens than the perfect + # distribution. + # < 1.0 does not make sense. + imbalance_factor = 1.3 + # Calculate the number of tokens per expert assuming perfect + # distribution. + num_tokens_per_expert = (num_tokens * top_k) // local_num_experts + # Apply the imbalance factor. + num_tokens_per_expert = int(num_tokens_per_expert * imbalance_factor) + # And pad the number to the next power of 2. + tile_tokens_dim = next_power_of_2(num_tokens_per_expert) + # Cap to 8-64 tokens per CTA tile as it's the range supported by the + # kernel. 
+ tile_tokens_dim = min(max(tile_tokens_dim, 8), 64) + + return tile_tokens_dim + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ): + topk = topk_ids.size(-1) + local_num_experts = w1.size(0) + intermediate_size = w2.size(1) + local_expert_offset = self.moe.ep_rank * local_num_experts + + x_quant = hidden_states + x_scale = a1q_scale + if x_scale is not None: + x_scale = x_scale.view(torch.float8_e4m3fn).reshape( + *x_quant.shape[:-1], -1) + + packed_tensor = (topk_ids.to(torch.int32) << 16) | topk_weights.to( + torch.bfloat16).view(torch.int16) + + assert w1_scale is not None + assert w2_scale is not None + kwargs = { + "topk_ids": + packed_tensor, + "routing_bias": + None, + "hidden_states": + x_quant, + "hidden_states_scale": + x_scale, + "gemm1_weights": + w1, + "gemm1_weights_scale": + w1_scale, + "gemm1_bias": + self.w13_bias, + "gemm1_alpha": + self.gemm1_alpha, + "gemm1_beta": + self.gemm1_beta, + "gemm1_clamp_limit": + self.gemm1_clamp_limit, + "gemm2_weights": + w2, + "gemm2_weights_scale": + w2_scale, + "gemm2_bias": + self.w2_bias, + "output1_scale_scalar": + None, + "output1_scale_gate_scalar": + None, + "output2_scale_scalar": + None, + "num_experts": + global_num_experts, + "top_k": + topk, + "n_group": + None, + "topk_group": + None, + "intermediate_size": + intermediate_size, + "local_expert_offset": + local_expert_offset, + "local_num_experts": + local_num_experts, + 
"routed_scaling_factor": + None, + "tile_tokens_dim": + self._get_tile_tokens_dim(x_quant, topk, local_num_experts), + "routing_method_type": + 1, + "do_finalize": + True, + "output": + output, + "tune_max_num_tokens": + self.max_capture_size, + } + + from flashinfer import trtllm_fp4_block_scale_routed_moe + trtllm_fp4_block_scale_routed_moe(**kwargs) + return output diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index 4c3e700ad3990..1aeb3f92bc3ea 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -12,6 +12,8 @@ from vllm.model_executor.layers.quantization.utils.int8_utils import ( per_token_group_quant_int8, per_token_quant_int8) from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( quant_dequant_mxfp4) +from vllm.model_executor.layers.quantization.utils.mxfp8_utils import ( + mxfp8_quantize) from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import cdiv @@ -177,6 +179,18 @@ def _mxfp4_quantize( return A, None +def _mxfp8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + per_act_token_quant: bool, + block_shape: Optional[list[int]] = None, +) -> tuple[torch.Tensor, torch.Tensor]: + assert A_scale is None + assert not per_act_token_quant + assert block_shape is None + return mxfp8_quantize(A) + + def moe_kernel_quantize_input( A: torch.Tensor, A_scale: Optional[torch.Tensor], @@ -195,6 +209,8 @@ def moe_kernel_quantize_input( is_sf_swizzled_layout=is_fp4_scale_swizzled) elif quant_dtype == "mxfp4": return _mxfp4_quantize(A, A_scale, per_act_token_quant, block_shape) + elif quant_dtype == "mxfp8": + return _mxfp8_quantize(A, A_scale, per_act_token_quant, block_shape) else: return A, A_scale diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py 
b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 6279bb8b60570..af9d1c46f68f4 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -322,6 +322,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): self, prepare_finalize: mk.FusedMoEPrepareAndFinalize, moe: FusedMoEConfig, + layer: torch.nn.Module, ) -> mk.FusedMoEPermuteExpertsUnpermute: """Return the appropriate GEMM experts implementation.""" experts = select_nvfp4_gemm_impl( @@ -719,10 +720,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): dtype=torch.int64) def select_gemm_impl( - self, - prepare_finalize: FusedMoEPrepareAndFinalize, - moe: FusedMoEConfig, - ) -> FusedMoEPermuteExpertsUnpermute: + self, prepare_finalize: FusedMoEPrepareAndFinalize, + moe: FusedMoEConfig, + layer: torch.nn.Module) -> FusedMoEPermuteExpertsUnpermute: # cutlass path if self.use_cutlass: from vllm.model_executor.layers.fused_moe import ( diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index be358cfa949f0..0200b0e9ed001 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -897,6 +897,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): self, prepare_finalize: FusedMoEPrepareAndFinalize, moe: FusedMoEConfig, + layer: torch.nn.Module, ) -> FusedMoEPermuteExpertsUnpermute: from vllm.model_executor.layers.fused_moe import ( BatchedTritonOrDeepGemmExperts, TritonOrDeepGemmExperts) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 72864853f7e0c..adce598c4ff1f 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -311,6 +311,7 @@ class 
ModelOptFp8MoEMethod(FusedMoEMethodBase): self, prepare_finalize: mk.FusedMoEPrepareAndFinalize, moe: FusedMoEConfig, + layer: torch.nn.Module, ) -> mk.FusedMoEPermuteExpertsUnpermute: experts = select_cutlass_fp8_gemm_impl( moe, @@ -1032,6 +1033,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): self, prepare_finalize: mk.FusedMoEPrepareAndFinalize, moe: FusedMoEConfig, + layer: torch.nn.Module, ) -> mk.FusedMoEPermuteExpertsUnpermute: experts = select_nvfp4_gemm_impl( moe, diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index bdeb169a4b97f..6724796904f01 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -10,6 +10,8 @@ from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase) +from vllm.model_executor.layers.fused_moe import modular_kernel as mk +from vllm.model_executor.layers.fused_moe.trtllm_moe import TrtLlmGenExperts from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -445,6 +447,91 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): return tile_tokens_dim + def select_gemm_impl( + self, + prepare_finalize: mk.FusedMoEPrepareAndFinalize, + moe: FusedMoEConfig, + layer: torch.nn.Module, + ) -> mk.FusedMoEPermuteExpertsUnpermute: + if (prepare_finalize.activation_format == + mk.FusedMoEActivationFormat.BatchedExperts): + raise NotImplementedError( + "Mxfp4 does not support batched experts format for EP") + else: + if should_use_flashinfer_mxfp4(): + # B200 code-path + kwargs = { + "gemm1_alpha": layer.gemm1_alpha, + "gemm1_beta": layer.gemm1_beta, + "gemm1_clamp_limit": layer.gemm1_clamp_limit, + "w13_bias": layer.w13_bias, + "w2_bias": layer.w2_bias, + "max_capture_size": self.max_capture_size, + } + return 
TrtLlmGenExperts(moe, **kwargs) + else: + # Use matmul_ogs from triton_kernels here! + raise NotImplementedError( + "Mxfp4 does not support non-batched experts format for EP") + + def _route_and_experts( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: Optional[torch.Tensor] = None, + logical_to_physical_map: Optional[torch.Tensor] = None, + logical_replica_count: Optional[torch.Tensor] = None + ) -> torch.Tensor: + + assert isinstance(self.fused_experts, mk.FusedMoEModularKernel) + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + enable_eplb=enable_eplb, + expert_map=expert_map, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count) + + return self.fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + apply_router_weight_on_input=apply_router_weight_on_input, + 
) + def apply( self, layer: torch.nn.Module, @@ -503,6 +590,29 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): activation=activation, expert_map=expert_map) + if self.fused_experts is not None: + return self._route_and_experts( + layer, + x, + router_logits, + top_k, + renormalize, + use_grouped_topk, + topk_group, + num_expert_group, + global_num_experts, + expert_map, + custom_routing_function, + scoring_func, + e_score_correction_bias, + apply_router_weight_on_input, + activation, + enable_eplb, + expert_load_view, + logical_to_physical_map, + logical_replica_count, + ) + assert _can_support_mxfp4( use_grouped_topk, topk_group, num_expert_group, expert_map, custom_routing_function, e_score_correction_bias, diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 48f9cc3737e47..3de928fea7202 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -66,11 +66,10 @@ def _can_support_mxfp4(use_grouped_topk: bool = False, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None): return not (use_grouped_topk or topk_group or num_expert_group - or expert_map or custom_routing_function - or e_score_correction_bias or apply_router_weight_on_input - or scoring_func != "softmax" or activation != "swigluoai" - or expert_load_view or logical_to_physical_map - or logical_replica_count) + or custom_routing_function or e_score_correction_bias + or apply_router_weight_on_input or scoring_func != "softmax" + or activation != "swigluoai" or expert_load_view + or logical_to_physical_map or logical_replica_count) def _dequant_mxfp4(x: torch.Tensor, scale: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py new file mode 100644 index 0000000000000..2a6b21c918f46 --- 
/dev/null +++ b/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def mxfp8_quantize(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + + try: + from flashinfer import mxfp8_quantize + except ImportError as err: + raise ImportError("The package `flashinfer` is required to do " + "MX-FP8 quantization. Please install it with" \ + "`pip install flashinfer`") from err + + return mxfp8_quantize(x, is_sf_swizzled_layout=False) From f9ca2b40a0357d98e3fb8bd951745dfaceab459e Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Wed, 27 Aug 2025 17:48:16 -0400 Subject: [PATCH 106/112] [Bugfix] Fix Marlin NVFP4 for modelopt (#23659) Signed-off-by: mgoin <mgoin64@gmail.com> --- .../layers/quantization/modelopt.py | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index adce598c4ff1f..9d4e453ffc545 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -891,7 +891,11 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): assert (layer.weight_scale.dtype == torch.float8_e4m3fn), ( "Weight Block scale must be represented as FP8-E4M3") - if self.backend == "flashinfer-trtllm": + if self.backend == "marlin": + prepare_fp4_layer_for_marlin(layer) + del layer.alpha + del layer.input_scale + elif self.backend == "flashinfer-trtllm": # FlashInfer TRTLLM FP4 GEMM requires a different weight layout. 
# FlashInfer provides nvfp4_quantize to quantize + shuffle the # layout but we use our own quantization so we have to call @@ -916,11 +920,6 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): requires_grad=False) layer.weight = Parameter(layer.weight.data, requires_grad=False) - if self.backend == "marlin": - prepare_fp4_layer_for_marlin(layer) - del layer.alpha - del layer.input_scale - def apply( self, layer: torch.nn.Module, @@ -1312,6 +1311,13 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): del layer.w2_weight_scale del layer.w13_weight del layer.w13_weight_scale + elif self.use_marlin: + # Marlin processing + prepare_moe_fp4_layer_for_marlin(layer) + del layer.g1_alphas + del layer.g2_alphas + del layer.w13_input_scale_quant + del layer.w2_input_scale_quant else: # Non-TRT-LLM processing (Cutlass or non-flashinfer) assert (layer.w13_weight_scale.shape[2] % 16 == 0), ( @@ -1333,13 +1339,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False) - if self.use_marlin: - prepare_moe_fp4_layer_for_marlin(layer) - del layer.g1_alphas - del layer.g2_alphas - del layer.w13_input_scale_quant - del layer.w2_input_scale_quant - def apply( self, layer: torch.nn.Module, From 321938e9ac4000e0cb37e328359a7fd3026bc672 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 27 Aug 2025 17:52:24 -0400 Subject: [PATCH 107/112] [Feature] Add `VLLM_DISABLE_PAD_FOR_CUDAGRAPH` to Avoid Hang Issue (#23595) Signed-off-by: yewentao256 <zhyanwentao@126.com> Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/envs.py | 7 +++++++ vllm/v1/worker/gpu_model_runner.py | 1 + 2 files changed, 8 insertions(+) diff --git a/vllm/envs.py b/vllm/envs.py index 35735b552575b..a6a795dcfcda9 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -166,6 +166,7 @@ if 
TYPE_CHECKING: VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None + VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False def get_default_cache_root(): @@ -1144,6 +1145,12 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_ENABLE_CUDAGRAPH_GC": lambda: bool(int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))), + # Disable padding to CUDA graph capture batch sizes. + # TODO(wentao): https://github.com/vllm-project/vllm/issues/23378 + # After the issue is fixed, we can remove this flag. + "VLLM_DISABLE_PAD_FOR_CUDAGRAPH": + lambda: bool(int(os.getenv("VLLM_DISABLE_PAD_FOR_CUDAGRAPH", "0"))), + # Used to force set up loopback IP "VLLM_LOOPBACK_IP": lambda: os.getenv("VLLM_LOOPBACK_IP", ""), diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 01c90b2ea38d3..a194808e513dd 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1491,6 +1491,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and not envs.VLLM_DISABLE_PAD_FOR_CUDAGRAPH and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): # Use CUDA graphs. # Add padding to the batch size. 
From 5da4f5d857933329aaca779e3a81f1385c84e34a Mon Sep 17 00:00:00 2001 From: Hanchenli <61769611+Hanchenli@users.noreply.github.com> Date: Wed, 27 Aug 2025 17:44:52 -0700 Subject: [PATCH 108/112] [Bugfix] Fix for V1 priority scheduling crashes at preemption (#23713) Signed-off-by: Hanchenli <lihanc2002@gmail.com> --- tests/v1/core/test_scheduler.py | 91 +++++++++++++++++++++++++++++++-- vllm/v1/core/sched/scheduler.py | 2 + 2 files changed, 90 insertions(+), 3 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 07d7c12a4f5ef..70e8691788045 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1293,7 +1293,8 @@ def create_requests_with_priority( mm_positions: Optional[list[list[PlaceholderRange]]] = None, max_tokens: int = 16, stop_token_ids: Optional[list[int]] = None, - prompt_logprobs: Optional[int] = None): + prompt_logprobs: Optional[int] = None, + starting_idx: int = 0): """Create requests with specified priorities and arrival times.""" assert len(priorities) == num_requests if arrival_times is not None: @@ -1315,8 +1316,8 @@ def create_requests_with_priority( mm_position = None mm_kwargs = None request = Request( - request_id=f"{i}", - prompt_token_ids=[i] * num_tokens, + request_id=f"{i + starting_idx}", + prompt_token_ids=[i + starting_idx] * num_tokens, sampling_params=sampling_params, pooling_params=None, multi_modal_kwargs=mm_kwargs, @@ -1813,3 +1814,87 @@ def test_schedule_skip_tokenizer_init_structured_output_request(): assert len(output.scheduled_new_reqs) == 0 assert len(scheduler.running) == 0 assert len(scheduler.waiting) == 1 + + +def test_priority_scheduling_preemption_when_out_of_kv(): + """Test that priority scheduling preempts lower priority requests + when out of KV cache space.""" + # Create scheduler with very limited memory to force preemption + scheduler = create_scheduler_with_priority( + max_num_seqs=2, # Allow multiple requests + max_num_batched_tokens=200, 
+ num_blocks=5, # Can hold 64 tokens (first block is null) + block_size=16, # Standard block size + ) + + # Create a request and schedule it + request_low = create_requests_with_priority( + num_requests=1, + priorities=[1], + arrival_times=[0.0], + num_tokens=30, + starting_idx=0, + )[0] + scheduler.add_request(request_low) + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 1 + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == 1 + + # Simulate model execution + model_output = ModelRunnerOutput( + req_ids=[request_low.request_id], + req_id_to_index={request_low.request_id: 0}, + sampled_token_ids=[[100]], + # spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + ) + scheduler.update_from_output(output, model_output) + + # Create a high priority request and schedule it + request_high = create_requests_with_priority( + num_requests=1, + priorities=[0], + arrival_times=[1.0], + num_tokens=32, + starting_idx=1, + )[0] + scheduler.add_request(request_high) + output = scheduler.schedule() + # KV cache should be full at this point + assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == 0 + assert len(output.scheduled_new_reqs) == 1 + assert output.scheduled_cached_reqs.num_reqs == 1 + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == 2 + + # Simulate model execution + requests = [request_low, request_high] + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[100] for _ in requests], + # spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + ) + scheduler.update_from_output(output, model_output) + + # Schedule again - this should trigger preemption + # req_low needs 32 tokens = 2 blocks + # req_high needs 33 tokens = 3 blocks + # so doesn't fit in 4 blocks. 
+ output = scheduler.schedule() + + # Should have preempted req_low + assert len(output.scheduled_new_reqs) == 0 + assert output.scheduled_cached_reqs.num_reqs == 1 + assert output.scheduled_cached_reqs.req_ids[0] == request_high.request_id + assert len(scheduler.waiting) == 1 + assert len(scheduler.running) == 1 \ No newline at end of file diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 14a914d8f2f0b..3bd2fe2f0515f 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -253,6 +253,8 @@ class Scheduler(SchedulerInterface): key=lambda r: (r.priority, r.arrival_time), ) self.running.remove(preempted_req) + if preempted_req in scheduled_running_reqs: + scheduled_running_reqs.remove(preempted_req) else: preempted_req = self.running.pop() From a69693e38f27f12e5a5d05b6792e590b520ca27b Mon Sep 17 00:00:00 2001 From: Benji Beck <benjibeck@meta.com> Date: Wed, 27 Aug 2025 19:43:26 -0700 Subject: [PATCH 109/112] Migrate Qwen inputs to TensorSchema (#23473) Signed-off-by: Benji Beck <benjibeck@meta.com> --- vllm/model_executor/models/qwen_vl.py | 51 +++++++++++++-------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 2950ca664a98f..90200f319464b 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -11,7 +11,7 @@ import math import unicodedata from collections.abc import Collection, Mapping, Sequence, Set from functools import lru_cache, partial -from typing import Callable, Literal, Optional, TypedDict, Union +from typing import Annotated, Callable, Literal, Optional, Union import regex as re import torch @@ -40,6 +40,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema 
import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) @@ -47,26 +48,34 @@ from .qwen import QWenBaseModel, QWenModel from .utils import flatten_bn, merge_multimodal_embeddings -class QwenImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - data: torch.Tensor +class QwenImagePixelInputs(TensorSchema): """ - Shape: `(batch_size * num_images, 3, image_size, image_size)` - + Dimensions: + - bn: Batch size * number of images + - c: Number of channels (3) + - h: Height + - w: Width + Note that image_size is the value in the vision config to which we resize the image to in the normalization transform. Currently multi-image support can only be leveraged by passing image embeddings directly. """ + type: Literal["pixel_values"] = "pixel_values" + data: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")] -class QwenImageEmbeddingInputs(TypedDict): - type: Literal["image_embeds"] - data: torch.Tensor - """Shape: `(batch_size * num_images, 256, hidden_size)` - +class QwenImageEmbeddingInputs(TensorSchema): + """ + Dimensions: + - bn: Batch size * number of images + - ifs: Image feature size (256) + - hs: Hidden size + `hidden_size` must match the hidden size of the language model backbone and is stored in the visual config of the model if we have one. 
""" + type: Literal["image_embeds"] = "image_embeds" + data: Annotated[torch.Tensor, TensorShape("bn", 256, "hs")] QwenImageInputs = Union[QwenImagePixelInputs, QwenImageEmbeddingInputs] @@ -697,19 +706,6 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA, self.transformer: QwenVLModel - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: - h = w = self.config.visual["image_size"] - expected_dims = (3, h, w) - actual_dims = tuple(data.shape[1:]) - - if actual_dims != expected_dims: - expected_expr = ("batch_size", *map(str, expected_dims)) - raise ValueError( - f"The expected shape of pixel values is {expected_expr}. " - f"You supplied {tuple(data.shape)}.") - - return data - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[QwenImageInputs]: pixel_values = kwargs.pop("pixel_values", None) @@ -720,10 +716,13 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA, raise ValueError("Incorrect type of pixel values. " f"Got type: {type(pixel_values)}") + expected_h = expected_w = self.config.visual["image_size"] + resolve_bindings = {"h": expected_h, "w": expected_w} + return QwenImagePixelInputs( type="pixel_values", - data=self._validate_pixel_values( - flatten_bn(pixel_values, concat=True)), + data=flatten_bn(pixel_values, concat=True), + resolve_bindings=resolve_bindings, ) if image_embeds is not None: From 1b7b161a09289214eea41e17895a68a7ccd4b1dc Mon Sep 17 00:00:00 2001 From: Shrey Gupta <66182248+Shrey1306@users.noreply.github.com> Date: Thu, 28 Aug 2025 08:42:44 +0530 Subject: [PATCH 110/112] [Feature] models: pass layer prefix to replace_linear_class for per-layer quantization routing. 
Addresses #23239 (#23556) Signed-off-by: Shrey Gupta <shreyg1303@gmail.com> --- vllm/model_executor/models/deepseek_vl2.py | 12 ++++++++---- vllm/model_executor/models/transformers.py | 14 ++++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 1bd2802a86838..5eab02b17151c 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -408,13 +408,17 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): if isinstance(module, nn.Linear): parent, attr_name = self._get_parent_and_attr(vit, name) if isinstance(parent, timm.layers.Mlp) and attr_name == "fc1": - new_linear = replace_linear_class(module, "colwise", - quant_config) + new_linear = replace_linear_class(module, + "colwise", + quant_config, + prefix=name) setattr(parent, attr_name, new_linear) elif isinstance(parent, timm.layers.Mlp) and attr_name == "fc2": - new_linear = replace_linear_class(module, "rowwise", - quant_config) + new_linear = replace_linear_class(module, + "rowwise", + quant_config, + prefix=name) setattr(parent, attr_name, new_linear) return vit diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index edf3dddb1bad2..f7ced6134da52 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -106,8 +106,11 @@ def can_enable_torch_compile(vllm_config: VllmConfig) -> bool: def replace_linear_class( - linear: nn.Linear, style: Literal["colwise", "rowwise"], - quant_config: QuantizationConfig + linear: nn.Linear, + style: Literal["colwise", "rowwise"], + quant_config: QuantizationConfig, + *, + prefix: str = "", ) -> Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]: """ Replace nn.Linear with one of vLLM's tensor parallel linear classes. 
@@ -141,6 +144,7 @@ def replace_linear_class( output_size=linear.out_features, bias=linear.bias is not None, quant_config=quant_config, + prefix=prefix, return_bias=False, **vllm_linear_kwargs, ) @@ -557,8 +561,10 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): generator = (p for p in tp_plan if re.match(p, qual_name)) pattern = next(generator, None) style = tp_plan.get(pattern, "replicate") - new_module = replace_linear_class(child_module, style, - self.quant_config) + new_module = replace_linear_class(child_module, + style, + self.quant_config, + prefix=qual_name) setattr(module, child_name, new_module) log_replacement(qual_name, child_module, new_module) else: From a781e84ec25b1d1b6c245f2e8ffec6e10bafdaa1 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Wed, 27 Aug 2025 23:12:53 -0400 Subject: [PATCH 111/112] [Perf] Tune configs for triton block fp8 gemm H100/H200 (#23748) Signed-off-by: mgoin <mgoin64@gmail.com> --- benchmarks/kernels/bench_block_fp8_gemm.py | 113 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 90 +++++------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 82 +++++----- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 62 ++++---- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 54 +++---- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 82 +++++----- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 84 +++++----- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 118 +++++++------- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 132 ++++++++-------- 
...,dtype=fp8_w8a8,block_shape=[128,128].json | 82 +++++----- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 76 ++++----- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 60 +++---- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 100 ++++++------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 108 ++++++------- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 76 ++++----- 21 files changed, 1592 insertions(+), 603 deletions(-) create mode 100644 benchmarks/kernels/bench_block_fp8_gemm.py create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py new file mode 100644 index 0000000000000..883f0cf7e55f1 --- /dev/null +++ b/benchmarks/kernels/bench_block_fp8_gemm.py @@ -0,0 +1,113 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + w8a8_block_fp8_matmul, +) +from vllm.platforms import current_platform +from vllm.triton_utils import triton as 
vllm_triton + +assert current_platform.is_cuda(), ( + "Only support benchmarking w8a8 block fp8 kernel on CUDA device." +) + +# DeepSeek-V3 weight shapes +DEEPSEEK_V3_SHAPES = [ + (512 + 64, 7168), + ((128 + 64) * 128, 7168), + (128 * (128 + 128), 512), + (7168, 16384), + (7168, 18432), + (18432 * 2, 7168), + (24576, 1536), + (12288, 7168), + (4096, 7168), + (7168, 2048), +] + + +def build_w8a8_block_fp8_runner(M, N, K, block_size, device): + """Build runner function for w8a8 block fp8 matmul.""" + factor_for_scale = 1e-2 + + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + # Create random FP8 tensors + A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max + A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max + B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + # Create scales + block_n, block_k = block_size[0], block_size[1] + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + + As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale + Bs = ( + torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device) + * factor_for_scale + ) + + def run(): + return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16) + + return run + + +@vllm_triton.testing.perf_report( + vllm_triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=["torch-bf16", "w8a8-block-fp8"], + line_names=["torch-bf16", "w8a8-block-fp8"], + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs W8A8 Block FP8 GEMMs", + args={}, + ) +) +def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)): + M = batch_size + device = "cuda" + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + a = 
torch.randn((M, K), device=device, dtype=torch.bfloat16) + b = torch.randn((N, K), device=device, dtype=torch.bfloat16) + ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + else: # w8a8-block-fp8 + run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device) + ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( + lambda: run_w8a8(), quantiles=quantiles + ) + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +if __name__ == "__main__": + block_size = (128, 128) + + for N, K in DEEPSEEK_V3_SHAPES: + print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}") + + print(f"TFLOP/s comparison (block_size={block_size}):") + benchmark_tflops.run( + print_data=True, + # show_plots=False, + # save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}", + N=N, + K=K, + block_size=block_size, + ) + + print("\nBenchmark finished!") diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..0ea0225c96af1 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..be487f2805b85 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 
3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..f74a52fc17c9d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, 
+ "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..8cab1b093276a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + 
"num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json index 1c61451fb34e5..ae244f90bb064 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,73 +1,73 @@ { "1": { - 
"BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4 }, "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, - "24": { - "BLOCK_SIZE_M": 64, + "4": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 }, "32": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "48": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -75,7 +75,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, @@ -83,7 
+83,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -91,7 +91,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -107,7 +107,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -115,15 +115,15 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "2048": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, @@ -133,13 +133,13 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4096": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 } diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index 63e661c80de6a..b2931d68f488a 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,83 +1,83 @@ { "1": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4 }, "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 8, - 
"num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, - "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "16": { - "BLOCK_SIZE_M": 64, + "4": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 }, "24": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "32": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "48": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "128": { "BLOCK_SIZE_M": 64, @@ -91,7 +91,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -99,9 +99,9 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 
+ "num_stages": 4 }, "1024": { "BLOCK_SIZE_M": 64, @@ -115,7 +115,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -139,8 +139,8 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 } -} +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json index 56b939e52fac3..ad630f0d787cf 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,30 +1,30 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3 }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, @@ -32,19 +32,19 @@ "num_stages": 3 }, "16": { - "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 2 }, "24": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, - "num_warps": 4, + "num_warps": 8, "num_stages": 3 }, "32": { @@ -59,9 +59,9 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "64": { "BLOCK_SIZE_M": 64, @@ -75,7 +75,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -83,7 +83,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -91,7 +91,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, @@ -123,7 +123,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3 }, @@ -139,7 +139,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 } diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index 63d9a0bf5d79d..10b940c04fad3 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,50 +1,50 @@ { "1": { - "BLOCK_SIZE_M": 
64, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4 }, "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3 }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 2 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 5 }, "24": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, + "GROUP_SIZE_M": 1, + "num_warps": 8, "num_stages": 3 }, "32": { @@ -59,15 +59,15 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 2 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -75,7 +75,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3 }, @@ -91,7 +91,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -139,7 +139,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 } diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json index 7fa398c15a2a5..94ce6e77f09ce 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,55 +1,55 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 5 - }, - "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 5 - }, - "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4, + "GROUP_SIZE_M": 1, + "num_warps": 8, "num_stages": 3 }, - "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 5 + "num_stages": 3 }, - "16": { - "BLOCK_SIZE_M": 64, + "4": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, - "24": { - "BLOCK_SIZE_M": 64, + "8": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, "32": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, @@ -59,31 +59,31 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -99,7 +99,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -107,7 +107,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -123,7 +123,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -131,7 +131,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index f15d8f64c7090..9540df407975e 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ 
b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,57 +1,57 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3 }, "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 5 - }, - "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 5 + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 5 + "num_stages": 3 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3 }, "24": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3 }, "32": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -59,33 +59,33 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "128": { 
- "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 }, "256": { "BLOCK_SIZE_M": 64, @@ -93,23 +93,23 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "512": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1024": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1536": { "BLOCK_SIZE_M": 64, @@ -133,7 +133,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4096": { "BLOCK_SIZE_M": 64, diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..96f6c307b357d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..567675787d4f9 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json index 51e237b91b8e7..0894ff2fa3322 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,6 +1,6 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, @@ -8,55 +8,55 @@ "num_stages": 5 }, "2": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 }, - "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 4 - }, - "16": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 4 - }, "24": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "32": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4 }, "48": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, @@ -64,83 +64,83 @@ "num_stages": 4 }, "64": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "96": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5 }, "128": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4 }, "256": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + 
"GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "512": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "1024": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 4 - }, - "1536": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "2048": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, - "3072": { + "1536": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, "num_stages": 3 }, "4096": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 } } diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index 6280219c9ee7d..86c68e08a1a6a 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,78 +1,78 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - 
"GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4 }, "2": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "24": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4 }, "32": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "48": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "64": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 }, "96": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, @@ -80,38 +80,14 @@ "num_stages": 5 }, "128": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 4 
- }, - "256": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, - "512": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 4 - }, - "1024": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 4 - }, - "1536": { + "256": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, @@ -119,19 +95,43 @@ "num_warps": 4, "num_stages": 5 }, - "2048": { + "512": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, "3072": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3 }, @@ -139,7 +139,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 } diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json index 0a1e14cffbb2a..af1a384cbcbd3 100644 --- 
a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,14 +1,14 @@ { "1": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5 }, "2": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, @@ -16,26 +16,26 @@ "num_stages": 5 }, "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5 }, "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5 }, @@ -43,9 +43,9 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "32": { "BLOCK_SIZE_M": 64, @@ -59,7 +59,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4 }, @@ -67,31 +67,31 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "256": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -101,25 +101,9 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1024": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 3 - }, - "1536": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 3 - }, - "2048": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, @@ -127,13 +111,29 @@ "num_warps": 4, "num_stages": 3 }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, "3072": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4096": { "BLOCK_SIZE_M": 64, @@ -141,6 +141,6 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 } } diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index 15b1c93f60fc5..d381764a26414 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,22 
+1,22 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5 }, - "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 5 - }, "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, @@ -24,18 +24,18 @@ "num_stages": 5 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "16": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5 }, @@ -45,47 +45,47 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "32": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "48": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "256": { "BLOCK_SIZE_M": 64, @@ -93,29 
+93,29 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "512": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1024": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1536": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -123,7 +123,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, @@ -133,7 +133,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4096": { "BLOCK_SIZE_M": 64, diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json index 8ff12e64c172f..821ad0c704573 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,43 +1,43 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, + "GROUP_SIZE_M": 16, + "num_warps": 8, "num_stages": 5 }, "2": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5 }, "4": { - 
"BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5 }, "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5 }, "16": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "24": { "BLOCK_SIZE_M": 64, @@ -45,7 +45,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "32": { "BLOCK_SIZE_M": 64, @@ -59,7 +59,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5 }, @@ -73,19 +73,19 @@ }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "256": { "BLOCK_SIZE_M": 64, @@ -99,21 +99,21 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, "1024": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, "1536": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, @@ -123,9 +123,9 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + 
"num_stages": 4 }, "3072": { "BLOCK_SIZE_M": 64, @@ -133,7 +133,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4096": { "BLOCK_SIZE_M": 64, diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index 4532f93681e2b..daaf21c286553 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,67 +1,67 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5 }, - "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 5 - }, - "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 5 - }, "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 4 - }, - "16": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 }, "24": 
{ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "32": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "48": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "64": { "BLOCK_SIZE_M": 64, @@ -73,25 +73,25 @@ }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "256": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -99,31 +99,31 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1024": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1536": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "2048": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -133,7 +133,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4096": { "BLOCK_SIZE_M": 64, @@ -141,6 +141,6 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + 
"num_stages": 4 } } diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json index ca7f32b9552b4..2583b5a3441ca 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,57 +1,57 @@ { "1": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 5 - }, - "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 5 - }, - "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 5 + "num_warps": 8, + "num_stages": 3 }, - "8": { - "BLOCK_SIZE_M": 64, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 5 + "num_stages": 3 }, - "16": { - "BLOCK_SIZE_M": 64, + "8": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4 }, "24": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 }, "32": { - 
"BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 }, @@ -59,43 +59,35 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 }, "128": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "256": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, - "512": { + "256": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, @@ -103,19 +95,27 @@ "num_warps": 4, "num_stages": 3 }, - "1024": { + "512": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, "1536": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, @@ -123,7 +123,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -131,7 +131,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, @@ -139,8 +139,8 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - 
"GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 } } diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index 5acea242cc0ad..baa64f8d3d141 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,65 +1,65 @@ { "1": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 8, - "num_stages": 4 + "num_stages": 5 }, "2": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 5 + "num_stages": 3 }, "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 5 + "num_stages": 3 }, - "8": { - "BLOCK_SIZE_M": 64, + "16": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 - }, - "16": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "24": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "32": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + 
"GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "48": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4 }, @@ -69,21 +69,21 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4 }, @@ -91,7 +91,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -99,13 +99,13 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, "1024": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, @@ -115,7 +115,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -123,7 +123,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -131,15 +131,15 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4096": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 } From a11adafdcab61c059d2a76d952367a722e1b71d5 Mon Sep 17 00:00:00 2001 From: Jan Kessler <Ithanil@users.noreply.github.com> Date: Thu, 28 Aug 2025 05:14:00 +0200 Subject: [PATCH 112/112] 
Gracefully handle edge cases in harmony utils (#23155) Signed-off-by: Jan Kessler <jakessle@uni-mainz.de> Co-authored-by: Chen Zhang <zhangch99@outlook.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> --- vllm/entrypoints/harmony_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index bc810f683f4a4..078d316844257 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -155,7 +155,7 @@ def parse_chat_input(chat_msg) -> Message: contents = [TextContent(text=content)] else: # TODO: Support refusal. - contents = [TextContent(text=c["text"]) for c in content] + contents = [TextContent(text=c.get("text", "")) for c in content] msg = Message.from_role_and_contents(role, contents) return msg @@ -218,8 +218,8 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]: ) output_items.append(reasoning_item) elif message.channel == "commentary": - if message.recipient.startswith("functions."): - function_name = message.recipient.split(".")[-1] + if recipient is not None and recipient.startswith("functions."): + function_name = recipient.split(".")[-1] for content in message.content: random_id = random_uuid() response_item = ResponseFunctionToolCall( @@ -230,8 +230,8 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]: id=f"ft_{random_id}", ) output_items.append(response_item) - elif message.recipient.startswith( - "python") or message.recipient.startswith("browser"): + elif recipient is not None and (recipient.startswith("python") + or recipient.startswith("browser")): for content in message.content: reasoning_item = ResponseReasoningItem( id=f"rs_{random_uuid()}", @@ -245,7 +245,7 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]: ) output_items.append(reasoning_item) else: - raise ValueError(f"Unknown recipient: {message.recipient}") + raise ValueError(f"Unknown 
recipient: {recipient}") elif message.channel == "final": contents = [] for content in message.content: