From 2a167b2eeb993638c198db49f3927bae5d55508b Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Mon, 25 Aug 2025 09:25:52 -0700 Subject: [PATCH 001/112] [test][RL] Add sleep level 2 test and fix reload with sleep mode (#23521) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- tests/basic_correctness/test_cumem.py | 31 +++++++++++++++++++++++++++ vllm/v1/worker/gpu_worker.py | 3 +-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index 34f9389c82a9b..f3ad680b72b55 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -177,3 +177,34 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): # cmp output assert output[0].outputs[0].text == output3[0].outputs[0].text + + +@create_new_process_for_each_test() +def test_deep_sleep(): + model = "Qwen/Qwen3-0.6B" + free, total = torch.cuda.mem_get_info() + used_bytes_baseline = total - free # in case other process is running + llm = LLM(model, enable_sleep_mode=True) + prompt = "How are you?" 
+ sampling_params = SamplingParams(temperature=0, max_tokens=10) + output = llm.generate(prompt, sampling_params) + + # Put the engine to deep sleep + llm.sleep(level=2) + + free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline + assert used_bytes < 3 * GiB_bytes + + llm.wake_up(tags=["weights"]) + llm.collective_rpc("reload_weights") + free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline + assert used_bytes < 4 * GiB_bytes + + # now allocate kv cache and cuda graph memory + llm.wake_up(tags=["kv_cache"]) + output2 = llm.generate(prompt, sampling_params) + + # cmp output + assert output[0].outputs[0].text == output2[0].outputs[0].text diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index f83a4f4faeb5e..1688b8b83e873 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -216,8 +216,7 @@ class Worker(WorkerBase): self.model_runner.update_config(overrides) def reload_weights(self) -> None: - with self._maybe_get_memory_pool_context(tag="weights"): - self.model_runner.reload_weights() + self.model_runner.reload_weights() @torch.inference_mode() def determine_available_memory(self) -> int: From 8a3cd90af534c39425ebfdfd295eea0a4582d541 Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Mon, 25 Aug 2025 11:47:52 -0700 Subject: [PATCH 002/112] [Kernel] Add fused grouped_topk kernel for MoE (#23274) Signed-off-by: Xin Yang Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- CMakeLists.txt | 4 +- csrc/moe/grouped_topk_kernels.cu | 757 ++++++++++++++++++ csrc/moe/moe_ops.h | 5 + csrc/moe/torch_bindings.cpp | 6 + tests/kernels/moe/test_grouped_topk.py | 76 ++ vllm/_custom_ops.py | 11 + vllm/envs.py | 6 + .../layers/fused_moe/fused_moe.py | 46 +- 8 files changed, 909 insertions(+), 2 deletions(-) create mode 
100644 csrc/moe/grouped_topk_kernels.cu create mode 100644 tests/kernels/moe/test_grouped_topk.py diff --git a/CMakeLists.txt b/CMakeLists.txt index aca42c3fe5553..b0ed4a284db95 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -817,7 +817,9 @@ set(VLLM_MOE_EXT_SRC "csrc/moe/topk_softmax_kernels.cu") if(VLLM_GPU_LANG STREQUAL "CUDA") - list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu") + list(APPEND VLLM_MOE_EXT_SRC + "csrc/moe/moe_wna16.cu" + "csrc/moe/grouped_topk_kernels.cu") endif() if(VLLM_GPU_LANG STREQUAL "CUDA") diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu new file mode 100644 index 0000000000000..78f7b3cc1aa25 --- /dev/null +++ b/csrc/moe/grouped_topk_kernels.cu @@ -0,0 +1,757 @@ +/* + * Adapted from + * https://github.com/NVIDIA/TensorRT-LLM/blob/v0.21.0/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu + * Copyright (c) 2025, The vLLM team. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & + * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +namespace cg = cooperative_groups; + +namespace vllm { +namespace moe { + +constexpr unsigned FULL_WARP_MASK = 0xffffffff; +constexpr int32_t WARP_SIZE = 32; +constexpr int32_t BLOCK_SIZE = 512; +constexpr int32_t NUM_WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE; + +namespace warp_topk { + +template +__host__ __device__ constexpr T round_up_to_multiple_of(T len) { + if (len == 0) { + return 0; + } + return ((len - 1) / size + 1) * size; +} + +template +constexpr __host__ __device__ bool isPowerOf2(T v) { + return (v && !(v & (v - 1))); +} + +template +__forceinline__ __device__ bool is_better_than(T val, T baseline) { + return (val > baseline && greater) || (val < baseline && !greater); +} + +template +__forceinline__ __device__ bool is_better_than(T val, T baseline, idxT index, + idxT baseline_index) { + bool res = (val > baseline && greater) || (val < baseline && !greater); + if (val == baseline) { + res = (index < baseline_index && greater) || + (index < baseline_index && !greater); + } + return res; +} + +template +int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) { + int64_t cache_topk = (sizeof(T) + sizeof(idxT)) * num_of_warp * k; + int64_t n = std::max(num_of_warp / 2 * k, num_of_warp * WARP_SIZE); + return max(cache_topk, + round_up_to_multiple_of<256>(n * sizeof(T)) + n * sizeof(idxT)); +} + +template +struct BitonicMerge { + // input should be a bitonic sequence, and sort it to be a monotonic sequence + __device__ static void merge(T* __restrict__ val_arr, + idxT* __restrict__ idx_arr) { + static_assert(isPowerOf2(size)); + static_assert(size >= 2 * WARP_SIZE); + constexpr int arr_len = size / WARP_SIZE; + + constexpr int stride = arr_len / 2; + for (int i = 0; i < stride; ++i) { + int const other_i = i + stride; + T& val = val_arr[i]; + T& other_val = val_arr[other_i]; + bool is_better; + if constexpr (is_stable) { + is_better = is_better_than(val, other_val, idx_arr[i], + 
idx_arr[other_i]); + } else { + is_better = is_better_than(val, other_val); + } + + if (is_better) { + T tmp = val; + val = other_val; + other_val = tmp; + + idxT tmp2 = idx_arr[i]; + idx_arr[i] = idx_arr[other_i]; + idx_arr[other_i] = tmp2; + } + } + + BitonicMerge::merge( + val_arr, idx_arr); + BitonicMerge::merge( + val_arr + arr_len / 2, idx_arr + arr_len / 2); + } +}; + +template +struct BitonicSort { + __device__ static void sort(T* __restrict__ val_arr, + idxT* __restrict__ idx_arr) { + static_assert(isPowerOf2(size)); + static_assert(size >= 2 * WARP_SIZE); + constexpr int arr_len = size / WARP_SIZE; + + BitonicSort::sort(val_arr, idx_arr); + BitonicSort::sort( + val_arr + arr_len / 2, idx_arr + arr_len / 2); + BitonicMerge::merge( + val_arr, idx_arr); + } +}; + +template +struct BitonicSort<32, ascending, T, idxT, is_stable> { + __device__ static void sort(T* __restrict__ val_arr, + idxT* __restrict__ idx_arr) { + int const lane = threadIdx.x % WARP_SIZE; + + // ascending doesn't matter before merging since all we need is a bitonic + // sequence + for (int stage = 0; stage < 4; ++stage) { + for (int stride = (1 << stage); stride > 0; stride /= 2) { + bool reverse = (lane >> stage) & 2; + bool is_second = lane & stride; + + T other = __shfl_xor_sync(FULL_WARP_MASK, *val_arr, stride); + idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, *idx_arr, stride); + + bool is_better; + if constexpr (is_stable) { + if constexpr (ascending) { + is_better = ((*val_arr > other) || + ((*val_arr == other) && (*idx_arr < other_idx))) != + (reverse != is_second); + } else { + is_better = ((*val_arr > other) || + ((*val_arr == other) && (*idx_arr > other_idx))) != + (reverse != is_second); + } + } else { + is_better = (*val_arr != other && + (*val_arr > other) != (reverse != is_second)); + } + if (is_better) { + *val_arr = other; + *idx_arr = other_idx; + } + } + } + + BitonicMerge<32, ascending, ascending, T, idxT, is_stable>::merge(val_arr, + idx_arr); + } +}; + +template 
+struct BitonicMerge<32, ascending, reverse, T, idxT, is_stable> { + __device__ static void merge(T* __restrict__ val_arr, + idxT* __restrict__ idx_arr) { + int const lane = threadIdx.x % WARP_SIZE; + for (int stride = WARP_SIZE / 2; stride > 0; stride /= 2) { + bool is_second = lane & stride; + T& val = *val_arr; + T other = __shfl_xor_sync(FULL_WARP_MASK, val, stride); + idxT& idx = *idx_arr; + idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, idx, stride); + + bool is_better; + if constexpr (is_stable) { + if constexpr (ascending) { + is_better = ((*val_arr > other) || + ((*val_arr == other) && (*idx_arr < other_idx))) == + (reverse != is_second); // for min + } else { + is_better = ((*val_arr > other) || + ((*val_arr == other) && (*idx_arr > other_idx))) == + (reverse != is_second); // for max + } + } else { + is_better = + (val != other && ((val > other) == (ascending != is_second))); + } + + if (is_better) { + val = other; + idx = other_idx; + } + } + } +}; + +template +class WarpSort { + public: + __device__ WarpSort(idxT k, T dummy) + : lane_(threadIdx.x % WARP_SIZE), k_(k), dummy_(dummy) { + static_assert(capacity >= WARP_SIZE && isPowerOf2(capacity)); + + for (int i = 0; i < max_arr_len_; ++i) { + val_arr_[i] = dummy_; + idx_arr_[i] = 0; + } + } + + // load and merge k sorted values + __device__ void load_sorted(T const* __restrict__ in, + idxT const* __restrict__ in_idx, idxT start) { + idxT idx = start + WARP_SIZE - 1 - lane_; + for (int i = max_arr_len_ - 1; i >= 0; --i, idx += WARP_SIZE) { + if (idx < start + k_) { + T t = in[idx]; + bool is_better; + if constexpr (is_stable) { + is_better = + is_better_than(t, val_arr_[i], in_idx[idx], idx_arr_[i]); + } else { + is_better = is_better_than(t, val_arr_[i]); + } + if (is_better) { + val_arr_[i] = t; + idx_arr_[i] = in_idx[idx]; + } + } + } + + BitonicMerge::merge( + val_arr_, idx_arr_); + } + + __device__ void dump(T* __restrict__ out, idxT* __restrict__ out_idx) const { + for (int i = 0; i < 
max_arr_len_; ++i) { + idxT out_i = i * WARP_SIZE + lane_; + if (out_i < k_) { + out[out_i] = val_arr_[i]; + out_idx[out_i] = idx_arr_[i]; + } + } + } + + __device__ void dumpIdx(idxT* __restrict__ out_idx) const { + for (int i = 0; i < max_arr_len_; ++i) { + idxT out_i = i * WARP_SIZE + lane_; + if (out_i < k_) { + out_idx[out_i] = idx_arr_[i]; + } + } + } + + protected: + static constexpr int max_arr_len_ = capacity / WARP_SIZE; + + T val_arr_[max_arr_len_]; + idxT idx_arr_[max_arr_len_]; + + int const lane_; + idxT const k_; + T const dummy_; + +}; // end class WarpSort + +template +class WarpSelect : public WarpSort { + public: + __device__ WarpSelect(idxT k, T dummy) + : WarpSort(k, dummy), + k_th_(dummy), + k_th_lane_((k - 1) % WARP_SIZE) { + extern __shared__ char smem_buf[]; // extern __shared__ T smem_buf[]; + + int const num_of_warp = blockDim.x / WARP_SIZE; + int const warp_id = threadIdx.x / WARP_SIZE; + val_smem_ = reinterpret_cast(smem_buf); + val_smem_ += warp_id * WARP_SIZE; + idx_smem_ = reinterpret_cast( + smem_buf + + round_up_to_multiple_of<256>(num_of_warp * sizeof(T) * WARP_SIZE)); + idx_smem_ += warp_id * WARP_SIZE; + } + + __device__ void add(T const* in, idxT start, idxT end) { + idxT const end_for_fullwarp = + round_up_to_multiple_of(end - start) + start; + for (idxT i = start + lane_; i < end_for_fullwarp; i += WARP_SIZE) { + T val = (i < end) ? 
in[i] : dummy_; + add(val, i); + } + } + + __device__ void add(T val, idxT idx) { + bool do_add; + if constexpr (is_stable) { + do_add = is_better_than(val, k_th_, idx, k_th_idx_); + } else { + do_add = is_better_than(val, k_th_); + } + + uint32_t mask = __ballot_sync(FULL_WARP_MASK, do_add); + if (mask == 0) { + return; + } + + int pos = smem_buf_len_ + __popc(mask & ((0x1u << lane_) - 1)); + if (do_add && pos < WARP_SIZE) { + val_smem_[pos] = val; + idx_smem_[pos] = idx; + do_add = false; + } + smem_buf_len_ += __popc(mask); + if (smem_buf_len_ >= WARP_SIZE) { + __syncwarp(); + merge_buf_(val_smem_[lane_], idx_smem_[lane_]); + smem_buf_len_ -= WARP_SIZE; + } + if (do_add) { + pos -= WARP_SIZE; + val_smem_[pos] = val; + idx_smem_[pos] = idx; + } + __syncwarp(); + } + + __device__ void done() { + if (smem_buf_len_) { + T val = (lane_ < smem_buf_len_) ? val_smem_[lane_] : dummy_; + idxT idx = (lane_ < smem_buf_len_) ? idx_smem_[lane_] : 0; + merge_buf_(val, idx); + } + + // after done(), smem is used for merging results among warps + __syncthreads(); + } + + private: + __device__ void set_k_th_() { + k_th_ = __shfl_sync(FULL_WARP_MASK, val_arr_[max_arr_len_ - 1], k_th_lane_); + if constexpr (is_stable) { + k_th_idx_ = + __shfl_sync(FULL_WARP_MASK, idx_arr_[max_arr_len_ - 1], k_th_lane_); + } + } + + __device__ void merge_buf_(T val, idxT idx) { + BitonicSort::sort(&val, &idx); + + T& old = val_arr_[max_arr_len_ - 1]; + + bool is_better; + if constexpr (is_stable) { + is_better = + is_better_than(val, old, idx, idx_arr_[max_arr_len_ - 1]); + } else { + is_better = is_better_than(val, old); + } + + if (is_better) { + old = val; + idx_arr_[max_arr_len_ - 1] = idx; + } + + BitonicMerge::merge( + val_arr_, idx_arr_); + + set_k_th_(); + } + + using WarpSort::max_arr_len_; + using WarpSort::val_arr_; + using WarpSort::idx_arr_; + using WarpSort::lane_; + using WarpSort::k_; + using WarpSort::dummy_; + + T* val_smem_; + idxT* idx_smem_; + int smem_buf_len_ = 0; + + T k_th_; 
+ idxT k_th_idx_; + int const k_th_lane_; +}; // end class WarpSelect +} // namespace warp_topk + +template +__device__ inline T_OUT cuda_cast(T_IN val) { + return val; +} + +template <> +__device__ inline float cuda_cast(__nv_bfloat16 val) { + return __bfloat162float(val); +} + +template +__device__ void topk_with_k2(T* output, T const* input, + cg::thread_block_tile<32> const& tile, + int32_t const lane_id, + int const num_experts_per_group) { + // Get the top2 per thread + T largest = -INFINITY; + T second_largest = -INFINITY; + + if (num_experts_per_group > WARP_SIZE) { + for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) { + T value = input[i]; + if (value > largest) { + second_largest = largest; + largest = value; + } else if (value > second_largest) { + second_largest = value; + } + } + } else { + for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) { + largest = input[i]; + } + } + + __syncwarp(); // Ensure all threads have valid data before reduction + // Get the top2 warpwise + T max1 = cg::reduce(tile, largest, cg::greater()); + + T max2 = max1; + bool equal_to_max1 = (max1 == largest); + + int count_max1 = __popc(__ballot_sync(FULL_WARP_MASK, equal_to_max1)); + + if (count_max1 == 1) { + largest = (largest == max1) ? 
second_largest : largest; + max2 = cg::reduce(tile, largest, cg::greater()); + } + + if (lane_id == 0) { + *output = max1 + max2; + } +} + +template +__global__ void topk_with_k2_kernel(T* output, T* input, + int64_t const num_tokens, + int64_t const num_cases, + int64_t const n_group, + int64_t const num_experts_per_group) { + int32_t warp_id = threadIdx.x / WARP_SIZE; + int32_t lane_id = threadIdx.x % WARP_SIZE; + + int32_t case_id = blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id; + if (case_id < num_cases) { + input += case_id * num_experts_per_group; + output += case_id; + + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block); + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + asm volatile("griddepcontrol.wait;"); +#endif + topk_with_k2(output, input, tile, lane_id, num_experts_per_group); + } +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + asm volatile("griddepcontrol.launch_dependents;"); +#endif +} + +template +__global__ void group_idx_and_topk_idx_kernel( + T* scores, T const* group_scores, T* topk_values, IdxT* topk_indices, + T* scores_with_bias, int64_t const num_tokens, int64_t const n_group, + int64_t const topk_group, int64_t const topk, int64_t const num_experts, + int64_t const num_experts_per_group, bool renormalize, + double routed_scaling_factor) { + int32_t warp_id = threadIdx.x / WARP_SIZE; + int32_t lane_id = threadIdx.x % WARP_SIZE; + int32_t case_id = + blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id; // one per token + scores_with_bias += case_id * num_experts; + scores += case_id * num_experts; + group_scores += case_id * n_group; + topk_values += case_id * topk; + topk_indices += case_id * topk; + + int32_t align_num_experts_per_group = + warp_topk::round_up_to_multiple_of(num_experts_per_group); + + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block); + + extern __shared__ char smem_buf[]; // NOTE: 
reuse the shared memory here to + // store the target topk idx + int32_t* s_topk_idx = reinterpret_cast(smem_buf); + T* s_topk_value = + reinterpret_cast(s_topk_idx + NUM_WARPS_PER_BLOCK * topk) + + warp_id * topk; + s_topk_idx += warp_id * topk; + + T value = cuda::std::numeric_limits::min(); + T topk_group_value = cuda::std::numeric_limits::min(); + int32_t num_equalto_topkth_group; + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + asm volatile("griddepcontrol.wait;"); // I think all prolog can be put before + // acqbulk because it's ptr arithmetic +#endif + + if (case_id < num_tokens) { + // calculate group_idx + int32_t target_num_min = WARP_SIZE - n_group + topk_group; + if (lane_id < n_group && + (isfinite(cuda_cast( + group_scores[lane_id])))) // The check is necessary to avoid + // abnormal input + { + value = group_scores[lane_id]; + } + + int count_equal_to_top_value = WARP_SIZE - n_group; + int pre_count_equal_to_top_value = 0; + // Use loop to find the largset top_group + while (count_equal_to_top_value < target_num_min) { + __syncwarp(); // Ensure all threads have valid data before reduction + topk_group_value = cg::reduce(tile, value, cg::greater()); + if (value == topk_group_value) { + value = cuda::std::numeric_limits::min(); + } + pre_count_equal_to_top_value = count_equal_to_top_value; + count_equal_to_top_value = __popc(__ballot_sync( + FULL_WARP_MASK, (value == cuda::std::numeric_limits::min()))); + } + num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value; + } + __syncthreads(); + + warp_topk::WarpSelect + queue((int32_t)topk, -INFINITY); + + int count_equalto_topkth_group = 0; + bool if_proceed_next_topk = + (topk_group_value != cuda::std::numeric_limits::min()); + if (case_id < num_tokens && if_proceed_next_topk) { + for (int i_group = 0; i_group < n_group; i_group++) { + if ((group_scores[i_group] > topk_group_value) || + ((group_scores[i_group] == topk_group_value) && + (count_equalto_topkth_group < 
num_equalto_topkth_group))) { + int32_t offset = i_group * num_experts_per_group; + for (int32_t i = lane_id; i < align_num_experts_per_group; + i += WARP_SIZE) { + T candidates = + (i < num_experts_per_group) && isfinite(cuda_cast( + scores_with_bias[offset + i])) + ? scores_with_bias[offset + i] + : cuda::std::numeric_limits::min(); + queue.add(candidates, offset + i); + } + if (group_scores[i_group] == topk_group_value) { + count_equalto_topkth_group++; + } + } + } + queue.done(); + __syncwarp(); + // Get the topk_idx + queue.dumpIdx(s_topk_idx); + __syncwarp(); + } + + // Load the valid score value + // Calculate the summation + float topk_sum = 1e-20; + if (case_id < num_tokens && if_proceed_next_topk) { + for (int i = lane_id; + i < warp_topk::round_up_to_multiple_of(topk); + i += WARP_SIZE) { + T value = + i < topk + ? scores[s_topk_idx[i]] + : cuda_cast(0.0f); // Load the valid value of expert + if (i < topk) { + s_topk_value[i] = value; + } + topk_sum += reduce(tile, cuda_cast(value), cg::plus()); + } + } + + __syncthreads(); + + if (case_id < num_tokens) { + if (if_proceed_next_topk) { + for (int i = lane_id; i < topk; i += WARP_SIZE) { + float value; + if (renormalize) { + value = cuda_cast(s_topk_value[i]) / topk_sum * + routed_scaling_factor; + } else { + value = cuda_cast(s_topk_value[i]) * routed_scaling_factor; + } + topk_indices[i] = s_topk_idx[i]; + topk_values[i] = cuda_cast(value); + } + } else { + for (int i = lane_id; i < topk; i += WARP_SIZE) { + topk_indices[i] = i; + topk_values[i] = cuda_cast(1.0f / topk); + } + } + // Note: when if_proceed_next_topk==false, choose the first 8 experts as the + // default result. 
+ } +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + asm volatile("griddepcontrol.launch_dependents;"); +#endif +} + +template +void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values, + IdxT* topk_indices, T* scores_with_bias, + int64_t const num_tokens, int64_t const num_experts, + int64_t const n_group, int64_t const topk_group, + int64_t const topk, bool const renormalize, + double const routed_scaling_factor, bool enable_pdl = false, + cudaStream_t const stream = 0) { + int64_t num_cases = num_tokens * n_group; + int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1; + auto* kernel_instance1 = &topk_with_k2_kernel; + cudaLaunchConfig_t config; + config.gridDim = topk_with_k2_num_blocks; + config.blockDim = BLOCK_SIZE; + config.dynamicSmemBytes = 0; + config.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl; + config.numAttrs = 1; + config.attrs = attrs; + cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores_with_bias, + num_tokens, num_cases, n_group, num_experts / n_group); + + int64_t topk_with_k_group_num_blocks = + (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1; + size_t dynamic_smem_in_bytes = + warp_topk::calc_smem_size_for_block_wide(NUM_WARPS_PER_BLOCK, + topk); + auto* kernel_instance2 = &group_idx_and_topk_idx_kernel; + config.gridDim = topk_with_k_group_num_blocks; + config.blockDim = BLOCK_SIZE; + config.dynamicSmemBytes = dynamic_smem_in_bytes; + config.stream = stream; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl; + config.numAttrs = 1; + config.attrs = attrs; + cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores, + topk_values, topk_indices, scores_with_bias, num_tokens, + n_group, topk_group, topk, num_experts, + num_experts / n_group, renormalize, 
routed_scaling_factor); +} + +#define INSTANTIATE_NOAUX_TC(T, IdxT) \ + template void invokeNoAuxTc( \ + T * scores, T * group_scores, T * topk_values, IdxT * topk_indices, \ + T * scores_with_bias, int64_t const num_tokens, \ + int64_t const num_experts, int64_t const n_group, \ + int64_t const topk_group, int64_t const topk, bool const renormalize, \ + double const routed_scaling_factor, bool enable_pdl, \ + cudaStream_t const stream); + +INSTANTIATE_NOAUX_TC(float, int32_t); +INSTANTIATE_NOAUX_TC(half, int32_t); +INSTANTIATE_NOAUX_TC(__nv_bfloat16, int32_t); +} // end namespace moe +} // namespace vllm + +std::tuple grouped_topk( + torch::Tensor const& scores, torch::Tensor const& scores_with_bias, + int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize, + double routed_scaling_factor) { + auto data_type = scores_with_bias.scalar_type(); + auto input_size = scores_with_bias.sizes(); + int64_t num_tokens = input_size[0]; + int64_t num_experts = input_size[1]; + TORCH_CHECK(input_size.size() == 2, "scores_with_bias must be a 2D Tensor"); + TORCH_CHECK(num_experts % n_group == 0, + "num_experts should be divisible by n_group"); + TORCH_CHECK(n_group <= 32, + "n_group should be smaller than or equal to 32 for now"); + TORCH_CHECK(topk <= 32, "topk should be smaller than or equal to 32 for now"); + + torch::Tensor group_scores = torch::empty( + {num_tokens, n_group}, torch::dtype(data_type).device(torch::kCUDA)); + torch::Tensor topk_values = torch::empty( + {num_tokens, topk}, torch::dtype(data_type).device(torch::kCUDA)); + torch::Tensor topk_indices = torch::empty( + {num_tokens, topk}, torch::dtype(torch::kInt32).device(torch::kCUDA)); + + auto stream = c10::cuda::getCurrentCUDAStream(scores_with_bias.get_device()); + + switch (data_type) { + case torch::kFloat16: + // Handle Float16 + vllm::moe::invokeNoAuxTc( + reinterpret_cast(scores.mutable_data_ptr()), + reinterpret_cast(group_scores.mutable_data_ptr()), + 
reinterpret_cast(topk_values.mutable_data_ptr()), + reinterpret_cast(topk_indices.mutable_data_ptr()), + reinterpret_cast(scores_with_bias.data_ptr()), num_tokens, + num_experts, n_group, topk_group, topk, renormalize, + routed_scaling_factor, false, stream); + break; + case torch::kFloat32: + // Handle Float32 + vllm::moe::invokeNoAuxTc( + reinterpret_cast(scores.mutable_data_ptr()), + reinterpret_cast(group_scores.mutable_data_ptr()), + reinterpret_cast(topk_values.mutable_data_ptr()), + reinterpret_cast(topk_indices.mutable_data_ptr()), + reinterpret_cast(scores_with_bias.data_ptr()), num_tokens, + num_experts, n_group, topk_group, topk, renormalize, + routed_scaling_factor, false, stream); + break; + case torch::kBFloat16: + // Handle BFloat16 + vllm::moe::invokeNoAuxTc<__nv_bfloat16, int32_t>( + reinterpret_cast<__nv_bfloat16*>(scores.mutable_data_ptr()), + reinterpret_cast<__nv_bfloat16*>(group_scores.mutable_data_ptr()), + reinterpret_cast<__nv_bfloat16*>(topk_values.mutable_data_ptr()), + reinterpret_cast(topk_indices.mutable_data_ptr()), + reinterpret_cast<__nv_bfloat16*>(scores_with_bias.data_ptr()), + num_tokens, num_experts, n_group, topk_group, topk, renormalize, + routed_scaling_factor, false, stream); + break; + default: + // Handle other data types + throw std::invalid_argument( + "Invalid dtype, only supports float16, float32, and bfloat16"); + break; + } + return {topk_values, topk_indices}; +} diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index 661730c96867e..92fc280b362b9 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -22,6 +22,11 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, torch::Tensor num_tokens_post_pad, int64_t top_k, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, int64_t BLOCK_SIZE_K, int64_t bit); + +std::tuple grouped_topk( + torch::Tensor const& scores, torch::Tensor const& scores_with_bias, + int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize, + double 
routed_scaling_factor); #endif bool moe_permute_unpermute_supported(); diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 7e49f68f62438..8f33d6cd666fa 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -78,6 +78,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "output_tensor) -> ()"); m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows); + // Apply grouped topk routing to select experts. + m.def( + "grouped_topk(Tensor scores, Tensor scores_with_bias, int n_group, int " + "topk_group, int topk, bool renormalize, float " + "routed_scaling_factor) -> (Tensor, Tensor)"); + m.impl("grouped_topk", torch::kCUDA, &grouped_topk); #endif } diff --git a/tests/kernels/moe/test_grouped_topk.py b/tests/kernels/moe/test_grouped_topk.py new file mode 100644 index 0000000000000..646e763194fd6 --- /dev/null +++ b/tests/kernels/moe/test_grouped_topk.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for the MoE grouped topk kernel + +Run `pytest tests/kernels/moe/test_grouped_topk.py`. 
+""" +import pytest +import torch + +from vllm.model_executor.layers.fused_moe.fused_moe import (fused_grouped_topk, + grouped_topk) +from vllm.platforms import current_platform + + +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="This test is skipped on non-CUDA platform.") +@pytest.mark.parametrize("n_token", [1, 33, 64]) +@pytest.mark.parametrize("n_hidden", [1024, 2048]) +@pytest.mark.parametrize("n_expert", [16]) +@pytest.mark.parametrize("topk", [2]) +@pytest.mark.parametrize("renormalize", [True, False]) +@pytest.mark.parametrize("num_expert_group", [8]) +@pytest.mark.parametrize("topk_group", [2]) +@pytest.mark.parametrize("scoring_func", ["softmax", "sigmoid"]) +@pytest.mark.parametrize("routed_scaling_factor", [1.0, 2.5]) +@pytest.mark.parametrize("dtype", + [torch.float16, torch.bfloat16, torch.float32]) +def test_grouped_topk(monkeypatch: pytest.MonkeyPatch, n_token: int, + n_hidden: int, n_expert: int, topk: int, + renormalize: bool, num_expert_group: int, + topk_group: int, scoring_func: str, + routed_scaling_factor: float, dtype: torch.dtype): + current_platform.seed_everything(0) + hidden_states = torch.randn((n_token, n_hidden), + dtype=dtype, + device="cuda") + gating_output = torch.randn((n_token, n_expert), + dtype=dtype, + device="cuda") + e_score_correction_bias = torch.randn((n_expert, ), + dtype=torch.float32, + device="cuda") + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "0") + baseline_topk_weights, baseline_topk_ids = grouped_topk( + hidden_states=hidden_states, + gating_output=gating_output, + topk=topk, + renormalize=renormalize, + num_expert_group=num_expert_group, + topk_group=topk_group, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias) + + test_topk_weights, test_topk_ids = fused_grouped_topk( + hidden_states=hidden_states, + gating_output=gating_output, + topk=topk, + renormalize=renormalize, + 
num_expert_group=num_expert_group, + topk_group=topk_group, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias) + + if renormalize: + torch.testing.assert_close(baseline_topk_weights, + test_topk_weights, + atol=2e-2, + rtol=0) + torch.testing.assert_close(baseline_topk_ids, + test_topk_ids, + atol=0, + rtol=0) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 3e3b43ce2abe3..054dc9d985a4c 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1502,6 +1502,17 @@ def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor, gating_output) +def grouped_topk(scores: torch.Tensor, scores_with_bias: torch.Tensor, + num_expert_group: int, topk_group: int, topk: int, + renormalize: bool, routed_scaling_factor: float): + if not current_platform.is_cuda(): + raise NotImplementedError("The fused grouped_topk kernel is only " + "available on CUDA platforms") + return torch.ops._moe_C.grouped_topk(scores, scores_with_bias, + num_expert_group, topk_group, topk, + renormalize, routed_scaling_factor) + + def moe_wna16_marlin_gemm(input: torch.Tensor, output: Optional[torch.Tensor], b_qweight: torch.Tensor, b_bias: Optional[torch.Tensor], diff --git a/vllm/envs.py b/vllm/envs.py index 5d0e972f43ad0..1c9c4cdde8001 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -131,6 +131,7 @@ if TYPE_CHECKING: VLLM_USE_DEEP_GEMM: bool = False VLLM_USE_DEEP_GEMM_E8M0: bool = True VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False + VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True VLLM_USE_FLASHINFER_MOE_FP8: bool = False VLLM_USE_FLASHINFER_MOE_FP4: bool = False VLLM_FLASHINFER_MOE_BACKEND: str = "throughput" @@ -963,6 +964,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_SKIP_DEEP_GEMM_WARMUP": lambda: bool(int(os.getenv("VLLM_SKIP_DEEP_GEMM_WARMUP", "0"))), + # Whether to use fused grouped_topk used for MoE expert selection. 
+ "VLLM_USE_FUSED_MOE_GROUPED_TOPK": + lambda: bool(int(os.getenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "1"))), + # Allow use of FlashInfer MoE kernels for fused moe ops. "VLLM_USE_FLASHINFER_MOE_FP8": lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))), @@ -1229,6 +1234,7 @@ def compute_hash() -> str: "VLLM_DISABLED_KERNELS", "VLLM_USE_DEEP_GEMM", "VLLM_USE_TRTLLM_FP4_GEMM", + "VLLM_USE_FUSED_MOE_GROUPED_TOPK", "VLLM_USE_FLASHINFER_MOE_FP8", "VLLM_USE_FLASHINFER_MOE_FP4", "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 02b7b65f4a025..84dafcf00d821 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -949,8 +949,23 @@ def grouped_topk( num_expert_group: int = 0, topk_group: int = 0, scoring_func: str = "softmax", - e_score_correction_bias: Optional[torch.Tensor] = None + routed_scaling_factor: float = 1.0, + e_score_correction_bias: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, torch.Tensor]: + if envs.VLLM_USE_FUSED_MOE_GROUPED_TOPK and \ + current_platform.is_cuda() and \ + num_expert_group <= 32 and topk <= 32 and \ + e_score_correction_bias is not None: + return fused_grouped_topk( + hidden_states=hidden_states, + gating_output=gating_output, + topk=topk, + renormalize=renormalize, + e_score_correction_bias=e_score_correction_bias, + num_expert_group=num_expert_group, + topk_group=topk_group, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor) assert hidden_states.size(0) == gating_output.size(0), ( "Number of tokens mismatch") @@ -996,9 +1011,38 @@ def grouped_topk( if renormalize: topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + topk_weights = topk_weights * routed_scaling_factor return topk_weights.to(torch.float32), topk_ids.to(torch.int32) +def fused_grouped_topk( + hidden_states: torch.Tensor, + gating_output: 
torch.Tensor, + topk: int, + renormalize: bool, + e_score_correction_bias: torch.Tensor, + num_expert_group: int = 0, + topk_group: int = 0, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, +) -> tuple[torch.Tensor, torch.Tensor]: + assert hidden_states.size(0) == gating_output.size(0), ( + "Number of tokens mismatch") + + if scoring_func == "softmax": + scores = torch.softmax(gating_output, dim=-1) + elif scoring_func == "sigmoid": + scores = gating_output.sigmoid() + else: + raise ValueError(f"Unsupported scoring function: {scoring_func}") + + scores_with_bias = scores + e_score_correction_bias.unsqueeze(0) + topk_values, topk_indices = ops.grouped_topk( + scores, scores_with_bias.to(scores.dtype), num_expert_group, + topk_group, topk, renormalize, routed_scaling_factor) + return topk_values.to(torch.float32), topk_indices.to(torch.int32) + + def get_config_dtype_str( dtype: torch.dtype, use_int4_w4a16: Optional[bool] = False, From 9188ae7cb5e78e6ecf95f41b587d3b279c231609 Mon Sep 17 00:00:00 2001 From: Zhonghua Deng Date: Tue, 26 Aug 2025 03:57:08 +0800 Subject: [PATCH 003/112] [Bugfix][V1][P/D]Fix the issue where repeated requests for the same input produce abnormal outputs for P2pNcclConnector (#23403) Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 25 +++++++++++++--- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 30 ++----------------- .../kv_connector/v1/p2p/tensor_memory_pool.py | 5 ++-- 3 files changed, 26 insertions(+), 34 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 32d0e43d71afe..25675d70fe225 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -245,16 +245,33 @@ class P2pNcclConnector(KVConnectorBase_V1): assert self.p2p_nccl_engine is not None + def 
extract_kv_from_layer( + layer: torch.Tensor, + slot_mapping: torch.Tensor, + ) -> torch.Tensor: + """Extract the KV cache from the layer. + + Assume the shape of the layer is (2, num_pages, page_size, xxx) + if MLA is not used, and (num_pages, page_size, xxx) otherwise. + """ + if isinstance(attn_metadata, MLACommonMetadata): + num_pages, page_size = layer.shape[0], layer.shape[1] + return layer.reshape(num_pages * page_size, -1)[slot_mapping, + ...] + num_pages, page_size = layer.shape[1], layer.shape[2] + return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, + ...] + connector_metadata = self._get_connector_metadata() assert isinstance(connector_metadata, P2pNcclConnectorMetadata) for request in connector_metadata.requests: request_id = request.request_id ip, port = self.parse_request_id(request_id, True) remote_address = ip + ":" + str(port + self._rank) - self.p2p_nccl_engine.send_tensor( - request_id + "#" + layer_name, kv_layer, remote_address, - request.slot_mapping, - isinstance(attn_metadata, MLACommonMetadata)) + + kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) + self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name, + kv_cache, remote_address) def wait_for_save(self): if self.is_producer: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index b94f2296dcb36..dfd95548c4632 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -62,8 +62,6 @@ class SendQueueItem: tensor_id: str remote_address: str tensor: torch.Tensor - slot_mapping: torch.Tensor - is_mla: bool class P2pNcclEngine: @@ -202,8 +200,6 @@ class P2pNcclEngine: tensor_id: str, tensor: torch.Tensor, remote_address: typing.Optional[str] = None, - slot_mapping: torch.Tensor = None, - is_mla: bool = False, ) -> bool: if remote_address is None: with 
self.recv_store_cv: @@ -213,9 +209,7 @@ class P2pNcclEngine: item = SendQueueItem(tensor_id=tensor_id, remote_address=remote_address, - tensor=tensor, - slot_mapping=slot_mapping, - is_mla=is_mla) + tensor=tensor) if self.send_type == "PUT": return self.send_sync(item) @@ -433,9 +427,7 @@ class P2pNcclEngine: if item.remote_address not in self.socks: self.create_connect(item.remote_address) - with self.send_stream: - tensor = self.extract_kv_from_layer(item.is_mla, item.tensor, - item.slot_mapping) + tensor = item.tensor sock = self.socks[item.remote_address] comm, rank = self.comms[item.remote_address] @@ -548,21 +540,3 @@ class P2pNcclEngine: self._send_thread.join() if self._ping_thread is not None: self._ping_thread.join() - - @staticmethod - def extract_kv_from_layer( - is_mla: bool, - layer: torch.Tensor, - slot_mapping: torch.Tensor, - ) -> torch.Tensor: - """Extract the KV cache from the layer. - Assume the shape of the layer is (2, num_pages, page_size, xxx) - if MLA is not used, and (num_pages, page_size, xxx) otherwise. - """ - if is_mla: - num_pages, page_size = layer.shape[0], layer.shape[1] - return layer.reshape(num_pages * page_size, -1)[slot_mapping, ...] - - num_pages, page_size = layer.shape[1], layer.shape[2] - return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, - ...] 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py index 02e3bc6274f60..b775276d4a846 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py @@ -99,8 +99,9 @@ class TensorMemoryPool: addr=self.base_address) self.free_lists[self.max_block_size][ initial_block.addr] = initial_block - logger.debug("TensorMemoryPool, base_address:", self.base_address, - self.base_address % self.max_block_size) + + logger.debug("TensorMemoryPool, base_address:%d, max_block_size:%d", + self.base_address, self.max_block_size) def allocate(self, size: int) -> int: """Allocates a memory block of at least the requested size. From 8a044754bd083671e4bb09a68b1edae9610dfccc Mon Sep 17 00:00:00 2001 From: Chaojun Zhang Date: Tue, 26 Aug 2025 04:09:26 +0800 Subject: [PATCH 004/112] [XPU] Delay BF16 check to worker init for spawn compatibility (#22979) Signed-off-by: chzhang --- vllm/platforms/cuda.py | 20 +++++++++++++++++++ vllm/platforms/interface.py | 7 +++++++ vllm/platforms/rocm.py | 20 +++++++++++++++++++ vllm/platforms/xpu.py | 37 +++++++++++------------------------- vllm/v1/worker/gpu_worker.py | 22 +-------------------- vllm/v1/worker/xpu_worker.py | 1 + 6 files changed, 60 insertions(+), 47 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 134ba36e5e735..c0e0fe35e4024 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -518,6 +518,26 @@ class CudaPlatformBase(Platform): supported = True return supported + @classmethod + def check_if_supports_dtype(cls, torch_dtype: torch.dtype): + if torch_dtype == torch.bfloat16: # noqa: SIM102 + if not cls.has_device_capability(80): + capability = cls.get_device_capability() + gpu_name = cls.get_device_name() + + if capability is None: + compute_str = "does not have a compute capability" + else: + 
version_str = capability.as_version_str() + compute_str = f"has compute capability {version_str}" + + raise ValueError( + "Bfloat16 is only supported on GPUs " + "with compute capability of at least 8.0. " + f"Your {gpu_name} GPU {compute_str}. " + "You can use float16 instead by explicitly setting the " + "`dtype` flag in CLI, for example: --dtype=half.") + # NVML utils # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 00bc555288e8e..f6c17de86d05a 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -572,6 +572,13 @@ class Platform: """ return False + @classmethod + def check_if_supports_dtype(cls, torch_dtype: torch.dtype): + """ + Check if the dtype is supported by the current platform. + """ + raise NotImplementedError + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 323ec591c50a3..85b2fe2e480c8 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -462,3 +462,23 @@ class RocmPlatform(Platform): def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str, model_config: "ModelConfig") -> bool: return True + + @classmethod + def check_if_supports_dtype(cls, torch_dtype: torch.dtype): + if torch_dtype == torch.bfloat16: # noqa: SIM102 + if not cls.has_device_capability(80): + capability = cls.get_device_capability() + gpu_name = cls.get_device_name() + + if capability is None: + compute_str = "does not have a compute capability" + else: + version_str = capability.as_version_str() + compute_str = f"has compute capability {version_str}" + + raise ValueError( + "Bfloat16 is only supported on GPUs " + "with compute capability of at least 8.0. " + f"Your {gpu_name} GPU {compute_str}. 
" + "You can use float16 instead by explicitly setting the " + "`dtype` flag in CLI, for example: --dtype=half.") diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index af24437f649f4..235e5d8294e52 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -97,13 +97,6 @@ class XPUPlatform(Platform): from vllm.config import CompilationLevel vllm_config.compilation_config.level = CompilationLevel.NO_COMPILATION # noqa: E501 - # Instances created using VllmConfig() typically have model_config as - # None by default. The modification involves adding a check to prevent - # potential null exceptions check and update model config. - if model_config is not None and model_config.dtype == torch.bfloat16 \ - and not cls.device_support_bf16(): - model_config.dtype = torch.float16 - # lazy import to avoid circular import from vllm.config import CUDAGraphMode compilation_config = vllm_config.compilation_config @@ -162,30 +155,11 @@ class XPUPlatform(Platform): torch.xpu.reset_peak_memory_stats(device) return torch.xpu.max_memory_allocated(device) - @classmethod - def device_support_bf16(cls) -> bool: - device_name = cls.get_device_name().lower() - if cls.is_client_gpu_a770(): - logger.warning("Intel Arc A770 have bfloat16 accuracy known issue," - " fallback to float16") - return False - else: - logger.info( - "Device name %s supports bfloat16. 
Please file an issue " - "if you encounter any accuracy problems with bfloat16.", - device_name) - return True - @classmethod def is_data_center_gpu(cls) -> bool: device_name = cls.get_device_name().lower() return device_name.count("data center gpu") > 0 - @classmethod - def is_client_gpu_a770(cls) -> bool: - device_name = cls.get_device_name().lower() - return device_name.count("a770") > 0 - @classmethod def get_device_communicator_cls(cls) -> str: return "vllm.distributed.device_communicators.xpu_communicator.XpuCommunicator" # noqa @@ -197,3 +171,14 @@ class XPUPlatform(Platform): @classmethod def device_count(cls) -> int: return torch.xpu.device_count() + + @classmethod + def check_if_supports_dtype(cls, torch_dtype: torch.dtype): + if torch_dtype == torch.bfloat16: # noqa: SIM102 + device_name = cls.get_device_name().lower() + # client gpu a770 + if device_name.count("a770") > 0: + raise ValueError( + "Intel Arc A770 have bfloat16 accuracy known issue. " + "You can use float16 instead by explicitly setting the " + "`dtype` flag in CLI, for example: --dtype=half.") diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 1688b8b83e873..0dca45a759216 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -167,7 +167,7 @@ class Worker(WorkerBase): self.device = torch.device(f"cuda:{self.local_rank}") current_platform.set_device(self.device) - _check_if_gpu_supports_dtype(self.model_config.dtype) + current_platform.check_if_supports_dtype(self.model_config.dtype) gc.collect() torch.cuda.empty_cache() @@ -612,23 +612,3 @@ def init_worker_distributed_environment( parallel_config.pipeline_parallel_size) ensure_kv_transfer_initialized(vllm_config) - - -def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): - # Check if the GPU supports the dtype. 
- if torch_dtype == torch.bfloat16: # noqa: SIM102 - if not current_platform.has_device_capability(80): - capability = current_platform.get_device_capability() - gpu_name = current_platform.get_device_name() - - if capability is None: - compute_str = "does not have a compute capability" - else: - version_str = capability.as_version_str() - compute_str = f"has compute capability {version_str}" - - raise ValueError( - "Bfloat16 is only supported on GPUs with compute capability " - f"of at least 8.0. Your {gpu_name} GPU {compute_str}. " - "You can use float16 instead by explicitly setting the " - "`dtype` flag in CLI, for example: --dtype=half.") diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 134d839252653..17288cda8eccf 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -145,6 +145,7 @@ class XPUWorker(Worker): ): self.device = torch.device(f"xpu:{self.local_rank}") current_platform.set_device(self.device) + current_platform.check_if_supports_dtype(self.model_config.dtype) torch.xpu.empty_cache() self.init_gpu_memory = torch.xpu.get_device_properties( self.local_rank).total_memory From c34c82b7fe5f62e771334bdafc0c4559856ce58f Mon Sep 17 00:00:00 2001 From: Pate Motter Date: Mon, 25 Aug 2025 14:29:16 -0700 Subject: [PATCH 005/112] [TPU][Bugfix] Fixes prompt_token_ids error in tpu tests. 
(#23574) Signed-off-by: Pate Motter --- .buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh | 2 +- .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index b571618f48c2b..1073a4ee30afa 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ - && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \ + && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ && python3 -m pip install --progress-bar off hf-transfer echo "--- Python dependencies installed ---" export VLLM_USE_V1=1 diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index d55a786e41e8b..505664f3aecd0 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ - && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \ + && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ && python3 -m pip install --progress-bar off hf-transfer echo "--- Python dependencies installed 
---" export VLLM_USE_V1=1 From 7b6a8372755dfd6b8b2449b24e2d9d8589ff0291 Mon Sep 17 00:00:00 2001 From: Terrence Zhao <32208165+Terrencezzj@users.noreply.github.com> Date: Mon, 25 Aug 2025 17:53:52 -0400 Subject: [PATCH 006/112] [Docs] Update Documentation of Cohere Command-A Models (#23584) Signed-off-by: Terrencezzj Signed-off-by: Abatom Co-authored-by: Zhonghua Deng --- docs/models/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 8fb1019f2bdfb..4763f2281d323 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -332,7 +332,7 @@ th { | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | | | `MBartForConditionalGeneration` | mBART | `facebook/mbart-large-en-ro`, `facebook/mbart-large-50`, etc. | | | | | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | | `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. 
| | ✅︎ | ✅︎ | From efc88cf64a399f5459cd6256223e99672c13614d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 25 Aug 2025 15:42:29 -0700 Subject: [PATCH 007/112] [Misc] Simplify FlashInfer attention metadata (#23585) Signed-off-by: Woosuk Kwon --- vllm/v1/attention/backends/flashinfer.py | 277 ++++++++++------------- 1 file changed, 114 insertions(+), 163 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 50819bb2bb943..941d2a4d7f1ac 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -123,29 +123,9 @@ class FlashInferMetadata: num_actual_tokens: int # Number of tokens excluding padding. - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - qo_indptr_cpu: torch.Tensor - # An example for paged_kv_indices, paged_kv_indptr: - # request 1, page indices [0, 5, 8] - # request 2, page indices [1, 6, 7] - # request 3, page indices [3, 4] - # paged_kv_indices is a concatenation of page indices of all requests: - # [0, 5, 8, 1, 6, 7, 3, 4] - # paged_kv_indptr is used to index into paged_kv_indices: - # [0, 3, 6, 8] - # The indptr of the paged kv cache, shape: [batch_size + 1] (CPU for plan) - paged_kv_indptr_cpu: torch.Tensor - # The page indices of the paged kv cache (on device for plan) - paged_kv_indices: torch.Tensor - # The number of entries in the last page of each request in - # the paged kv cache, shape: [batch_size] (CPU for plan) - paged_kv_last_page_len_cpu: torch.Tensor # The data type of the query q_data_type: torch.dtype - seq_lens_cpu: torch.Tensor slot_mapping: torch.Tensor # For flashinfer trtllm batch decode @@ -164,10 +144,6 @@ class FlashInferMetadata: # For cascade attention (CPU for planning). 
use_cascade: bool - shared_qo_indptr_cpu: Optional[torch.Tensor] = None - shared_kv_page_indptr_cpu: Optional[torch.Tensor] = None - shared_kv_page_indices_cpu: Optional[torch.Tensor] = None - shared_kv_last_page_len_cpu: Optional[torch.Tensor] = None prefill_wrapper: Optional[BatchPrefillWithPagedKVCacheWrapper] = None decode_wrapper: Optional[BatchDecodeWithPagedKVCacheWrapper] = None @@ -327,134 +303,6 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): 2, self._get_workspace_buffer(), get_kv_cache_layout()) return self._cascade_wrapper - def _plan(self, attn_metadata: FlashInferMetadata): - if attn_metadata.use_cascade: - attn_metadata.cascade_wrapper = self._get_cascade_wrapper() - attn_metadata.cascade_wrapper.plan( - [ - attn_metadata.shared_qo_indptr_cpu, - attn_metadata.qo_indptr_cpu - ], - [ - attn_metadata.shared_kv_page_indptr_cpu, - attn_metadata.paged_kv_indptr_cpu - ], - [ - attn_metadata.shared_kv_page_indices_cpu, - attn_metadata.paged_kv_indices - ], - [ - attn_metadata.shared_kv_last_page_len_cpu, - attn_metadata.paged_kv_last_page_len_cpu - ], - self.num_qo_heads, - self.num_kv_heads, - self.head_dim, - self.page_size, - causal=True, - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters.logits_soft_cap, - q_data_type=self.q_data_type, - kv_data_type=self.kv_cache_dtype, - ) - else: - # Regular attention (common case). 
- # Decodes are at the front and prefills are at the back, - # according to reorder_batch() - num_prefills = attn_metadata.num_prefills - num_decodes = attn_metadata.num_decodes - if num_prefills > 0: - # Decodes are first so prefills start after the last decode - prefill_start = num_decodes - attn_metadata.prefill_wrapper = self._get_prefill_wrapper() - assert attn_metadata.qo_indptr_cpu[prefill_start:].shape[ - 0] == num_prefills + 1 - assert attn_metadata.paged_kv_indptr_cpu[prefill_start:].shape[ - 0] == num_prefills + 1 - assert attn_metadata.paged_kv_last_page_len_cpu[ - prefill_start:].shape[0] == num_prefills - # Since prefill_wrapper.run() will be called with - # query[num_decode_tokens:] we need to adjust the qo_indptr - # to be relative to the start of the prefill queries. - qo_indptr_cpu = attn_metadata.qo_indptr_cpu[ - prefill_start:] - attn_metadata.qo_indptr_cpu[prefill_start] - paged_kv_indptr_cpu = attn_metadata.paged_kv_indptr_cpu[ - prefill_start:] - if not attn_metadata.prefill_use_trtllm: - attn_metadata.prefill_wrapper.plan( - qo_indptr_cpu, - paged_kv_indptr_cpu, - attn_metadata.paged_kv_indices, - attn_metadata. - paged_kv_last_page_len_cpu[prefill_start:], - self.num_qo_heads, - self.num_kv_heads, - self.head_dim, - self.page_size, - causal=True, - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters. 
- logits_soft_cap, - q_data_type=self.q_data_type, - kv_data_type=self.kv_cache_dtype, - ) - else: - attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(self.device) - attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to( - self.device) - - if num_decodes > 0: - pure_decode = num_prefills == 0 - # possible required padding for cudagraph replay - use_cudagraph = (self.enable_cuda_graph and pure_decode and - num_decodes <= self._decode_cudagraph_max_bs) - if use_cudagraph: - num_input_tokens = ( - self.vllm_config.pad_for_cudagraph(num_decodes)) - # Carefully fulfill the padding region with reasonable value - # on cpu. - # Make sure paged_kv_indptr_cpu is not decreasing - self.paged_kv_indptr_cpu[1 + num_decodes:1 + - num_input_tokens].fill_( - attn_metadata. - paged_kv_indptr_cpu[-1]) - # Fill the remaining paged_kv_last_page_len_cpu with 1. - # This is because flashinfer treats 0 as a full page - # instead of empty. - self.paged_kv_last_page_len_cpu[ - num_decodes:num_input_tokens].fill_(1) - - else: - num_input_tokens = num_decodes - - attn_metadata.decode_wrapper = self._get_decode_wrapper( - num_input_tokens, use_cudagraph) - if not attn_metadata.decode_use_trtllm: - # Use the persistent buffer with padding length, - # instead of the same address but chunked version - # in atten_metadata when using cudagraph. - fast_plan_decode( - attn_metadata.decode_wrapper, - self.paged_kv_indptr_cpu[:num_input_tokens + 1], - attn_metadata.paged_kv_indices, - self.paged_kv_last_page_len_cpu[:num_input_tokens], - attn_metadata.seq_lens_cpu[:num_input_tokens], - self.num_qo_heads, - self.num_kv_heads, - self.head_dim, - self.page_size, - # Disable flashinfer's pos encoding and use vllm's rope. - pos_encoding_mode="NONE", - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters. 
- logits_soft_cap, - q_data_type=self.q_data_type, - kv_data_type=self.kv_cache_dtype, - ) - def build(self, common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata, @@ -548,13 +396,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, - qo_indptr_cpu=common_attn_metadata.query_start_loc_cpu, - paged_kv_indptr_cpu=self.paged_kv_indptr_cpu[:1 + num_reqs], - paged_kv_indices=paged_kv_indices, - paged_kv_last_page_len_cpu=self. - paged_kv_last_page_len_cpu[:num_reqs], q_data_type=self.q_data_type, - seq_lens_cpu=seq_lens_cpu, slot_mapping=common_attn_metadata.slot_mapping, max_q_len=max_q_len, max_seq_len=max_seq_len, @@ -567,14 +409,123 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): num_prefills=num_prefills, num_prefill_tokens=num_prefill_tokens, use_cascade=use_cascade, - shared_qo_indptr_cpu=shared_qo_indptr_cpu, - shared_kv_page_indptr_cpu=shared_kv_page_indptr_cpu, - shared_kv_page_indices_cpu=shared_kv_page_indices_cpu, - shared_kv_last_page_len_cpu=shared_kv_last_page_len_cpu, ) - self._plan(attn_metadata) + qo_indptr_cpu = common_attn_metadata.query_start_loc_cpu + paged_kv_indptr_cpu = self.paged_kv_indptr_cpu[:1 + num_reqs] + paged_kv_last_page_len_cpu = self.paged_kv_last_page_len_cpu[:num_reqs] + if attn_metadata.use_cascade: + attn_metadata.cascade_wrapper = self._get_cascade_wrapper() + attn_metadata.cascade_wrapper.plan( + [shared_qo_indptr_cpu, qo_indptr_cpu], + [shared_kv_page_indptr_cpu, paged_kv_indptr_cpu], + [shared_kv_page_indices_cpu, paged_kv_indices], + [shared_kv_last_page_len_cpu, paged_kv_last_page_len_cpu], + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + self.page_size, + causal=True, + sm_scale=self.global_hyperparameters.sm_scale, + window_left=self.global_hyperparameters.window_left, + logits_soft_cap=self.global_hyperparameters.logits_soft_cap, + 
q_data_type=self.q_data_type, + kv_data_type=self.kv_cache_dtype, + ) + else: + # Regular attention (common case). + # Decodes are at the front and prefills are at the back, + # according to reorder_batch() + num_prefills = attn_metadata.num_prefills + num_decodes = attn_metadata.num_decodes + if num_prefills > 0: + # Decodes are first so prefills start after the last decode + prefill_start = num_decodes + attn_metadata.prefill_wrapper = self._get_prefill_wrapper() + assert qo_indptr_cpu[prefill_start:].shape[ + 0] == num_prefills + 1 + assert paged_kv_indptr_cpu[prefill_start:].shape[ + 0] == num_prefills + 1 + assert paged_kv_last_page_len_cpu[prefill_start:].shape[ + 0] == num_prefills + # Since prefill_wrapper.run() will be called with + # query[num_decode_tokens:] we need to adjust the qo_indptr + # to be relative to the start of the prefill queries. + qo_indptr_cpu = qo_indptr_cpu[prefill_start:] - qo_indptr_cpu[ + prefill_start] + paged_kv_indptr_cpu = paged_kv_indptr_cpu[prefill_start:] + if not attn_metadata.prefill_use_trtllm: + attn_metadata.prefill_wrapper.plan( + qo_indptr_cpu, + paged_kv_indptr_cpu, + paged_kv_indices, + paged_kv_last_page_len_cpu[prefill_start:], + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + self.page_size, + causal=True, + sm_scale=self.global_hyperparameters.sm_scale, + window_left=self.global_hyperparameters.window_left, + logits_soft_cap=self.global_hyperparameters. 
+ logits_soft_cap, + q_data_type=self.q_data_type, + kv_data_type=self.kv_cache_dtype, + ) + else: + attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(self.device) + attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to( + self.device) + + if num_decodes > 0: + pure_decode = num_prefills == 0 + # possible required padding for cudagraph replay + use_cudagraph = (self.enable_cuda_graph and pure_decode and + num_decodes <= self._decode_cudagraph_max_bs) + if use_cudagraph: + num_input_tokens = ( + self.vllm_config.pad_for_cudagraph(num_decodes)) + # Carefully fulfill the padding region with reasonable value + # on cpu. + # Make sure paged_kv_indptr_cpu is not decreasing + self.paged_kv_indptr_cpu[1 + num_decodes:1 + + num_input_tokens].fill_( + paged_kv_indptr_cpu[-1]) + # Fill the remaining paged_kv_last_page_len_cpu with 1. + # This is because flashinfer treats 0 as a full page + # instead of empty. + self.paged_kv_last_page_len_cpu[ + num_decodes:num_input_tokens].fill_(1) + + else: + num_input_tokens = num_decodes + + attn_metadata.decode_wrapper = self._get_decode_wrapper( + num_input_tokens, use_cudagraph) + if not attn_metadata.decode_use_trtllm: + # Use the persistent buffer with padding length, + # instead of the same address but chunked version + # in atten_metadata when using cudagraph. + fast_plan_decode( + attn_metadata.decode_wrapper, + self.paged_kv_indptr_cpu[:num_input_tokens + 1], + paged_kv_indices, + self.paged_kv_last_page_len_cpu[:num_input_tokens], + seq_lens_cpu[:num_input_tokens], + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + self.page_size, + # Disable flashinfer's pos encoding and use vllm's rope. + pos_encoding_mode="NONE", + sm_scale=self.global_hyperparameters.sm_scale, + window_left=self.global_hyperparameters.window_left, + logits_soft_cap=self.global_hyperparameters. 
+ logits_soft_cap, + q_data_type=self.q_data_type, + kv_data_type=self.kv_cache_dtype, + ) return attn_metadata def build_for_cudagraph_capture( From 2a97ffc33de097f267f217132ced42f4714b7de5 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Mon, 25 Aug 2025 16:44:51 -0700 Subject: [PATCH 008/112] [Misc] Add release note draft to PR template (#23598) Signed-off-by: simon-mo --- .github/PULL_REQUEST_TEMPLATE.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 1b30c1292df85..8043df65d5585 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -7,8 +7,6 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT ## Test Result -## (Optional) Documentation Update - ---
Essential Elements of an Effective PR Description Checklist @@ -17,6 +15,7 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT - [ ] The test plan, such as providing test command. - [ ] The test results, such as pasting the results comparison before and after, or e2e results - [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model. +- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
**BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) From 906e461ed6ddccd3cc7b68fa72048d2d3fcbd72c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 25 Aug 2025 21:29:00 -0400 Subject: [PATCH 009/112] [CI Fix] Pin deepep and pplx tags in tools/ep_kernels/, gate multigpu tests (#23568) Signed-off-by: mgoin --- .buildkite/test-pipeline.yaml | 1 + tests/distributed/test_comm_ops.py | 12 +++++------- tests/kernels/moe/test_deepep_deepgemm_moe.py | 3 +++ tests/kernels/moe/test_deepep_moe.py | 3 +++ .../moe/test_modular_kernel_combinations.py | 2 ++ tests/kernels/moe/test_pplx_cutlass_moe.py | 2 ++ tests/kernels/moe/test_pplx_moe.py | 5 +++++ tests/utils.py | 9 ++++++--- tools/ep_kernels/install_python_libraries.sh | 15 +++++++++++++-- 9 files changed, 40 insertions(+), 12 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 20f3ce1adb46d..1ccfa93c571ce 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -390,6 +390,7 @@ steps: - csrc/moe/ - tests/kernels/moe - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ commands: - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index e2cb579e22dc4..8d84cc2d0ffe6 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -18,7 +18,8 @@ from vllm.distributed import (broadcast_tensor_dict, get_pp_group, tensor_model_parallel_all_reduce, tensor_model_parallel_reduce_scatter) -from ..utils import init_test_distributed_environment, multi_process_parallel +from ..utils import (init_test_distributed_environment, multi_gpu_test, + multi_process_parallel) @ray.remote(num_gpus=1, max_calls=1) @@ -226,8 +227,7 @@ def send_recv_test_worker( torch.testing.assert_close(test_tensor, recv_tensor) 
-@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("test_target", [ all_reduce_test_worker, all_gather_test_worker, @@ -241,8 +241,7 @@ def test_multi_process_tensor_parallel( multi_process_parallel(monkeypatch, tp_size, 1, test_target) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("pp_size", [2]) @pytest.mark.parametrize( "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]) @@ -254,8 +253,7 @@ def test_multi_process_pipeline_parallel( multi_process_parallel(monkeypatch, 1, pp_size, test_target) -@pytest.mark.skipif(torch.cuda.device_count() < 4, - reason="Need at least 4 GPUs to run the test.") +@multi_gpu_test(num_gpus=4) @pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("pp_size", [2]) @pytest.mark.parametrize("test_target", [ diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 6f95581a5e60d..1e922be47f2b4 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -23,6 +23,7 @@ from vllm.utils import has_deep_ep, has_deep_gemm from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, is_deep_gemm_supported) +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch from .utils import make_test_weights @@ -370,6 +371,7 @@ NUM_EXPERTS = [32] @pytest.mark.parametrize("num_experts", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOPKS) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) +@multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm @pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), @@ -427,6 +429,7 @@ USE_FP8_DISPATCH = [False] @pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH) 
@pytest.mark.parametrize("block_size", [[128, 128]]) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) +@multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm @pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index 43804c410b6c2..6a53af68cd53a 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -24,6 +24,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.platforms import current_platform from vllm.utils import has_deep_ep +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch if has_deep_ep(): @@ -411,6 +412,7 @@ DTYPES = [torch.bfloat16, torch.float8_e4m3fn] @pytest.mark.parametrize("topk", [6]) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @pytest.mark.parametrize("per_act_token_quant", [False, True]) +@multi_gpu_test(num_gpus=2) @requires_deep_ep def test_deep_ep_moe( dtype: torch.dtype, @@ -459,6 +461,7 @@ USE_FP8_DISPATCH = [True, False] @pytest.mark.parametrize("topk", [6]) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH) +@multi_gpu_test(num_gpus=2) @requires_deep_ep def test_low_latency_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int], num_experts: int, topk: int, diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index d45982384eb3b..6112183be5475 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +from ...utils import multi_gpu_test from .modular_kernel_tools.common import (Config, 
RankTensors, WeightTensors, reference_moe_impl, run_modular_kernel) @@ -162,6 +163,7 @@ def is_nyi_config(config: Config) -> bool: product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES)) @pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs) @pytest.mark.parametrize("world_size", [2]) +@multi_gpu_test(num_gpus=2) @meets_multi_gpu_requirements def test_modular_kernel_combinations_multigpu( k: int, n: int, e: int, dtype: torch.dtype, diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index 98908f2714707..9e78f4d6e4da0 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -17,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( from vllm.platforms import current_platform from vllm.utils import cdiv +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch try: @@ -247,6 +248,7 @@ def _pplx_moe( @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("world_dp_size", [[2, 1]]) #, [4, 2]]) @pytest.mark.parametrize("use_internode", [False]) +@multi_gpu_test(num_gpus=2) @pytest.mark.skipif( (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( current_platform.get_device_capability()), diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index c2064de97358f..3f36d7ada2e94 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -37,6 +37,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.platforms import current_platform from vllm.utils import round_up +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch requires_pplx = pytest.mark.skipif( @@ -452,6 +453,7 @@ def _pplx_prepare_finalize( @pytest.mark.parametrize("use_internode", [False]) @pytest.mark.optional @requires_pplx 
+@multi_gpu_test(num_gpus=2) def test_pplx_prepare_finalize_slow( mnk: tuple[int, int, int], e: int, @@ -740,6 +742,7 @@ def _pplx_moe( @pytest.mark.parametrize("use_internode", [False]) @pytest.mark.optional @requires_pplx +@multi_gpu_test(num_gpus=2) def test_pplx_moe_slow( mnk: tuple[int, int, int], e: int, @@ -880,6 +883,7 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool, @pytest.mark.parametrize("world_dp_size", [[2, 1]]) @pytest.mark.parametrize("use_internode", [False]) @requires_pplx +@multi_gpu_test(num_gpus=2) def test_pplx_prepare_finalize( world_dp_size: tuple[int, int], use_internode: bool, @@ -893,6 +897,7 @@ def test_pplx_prepare_finalize( @pytest.mark.parametrize("world_dp_size", [[2, 1]]) @pytest.mark.parametrize("use_internode", [False]) @requires_pplx +@multi_gpu_test(num_gpus=2) def test_pplx_moe( world_dp_size: tuple[int, int], use_internode: bool, diff --git a/tests/utils.py b/tests/utils.py index 4dba5494665a3..9d2073f3c1036 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -696,9 +696,12 @@ def multi_process_parallel( os.environ["RAY_RUNTIME_ENV_IGNORE_GITIGNORE"] = "1" ray.init( runtime_env={ - "working_dir": VLLM_PATH, - "excludes": - ["build", ".git", "cmake-build-*", "shellcheck", "dist"] + "working_dir": + VLLM_PATH, + "excludes": [ + "build", ".git", "cmake-build-*", "shellcheck", "dist", + "ep_kernels_workspace" + ] }) distributed_init_port = get_open_port() diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh index e163c83e8b513..59bfe69dc0dd6 100644 --- a/tools/ep_kernels/install_python_libraries.sh +++ b/tools/ep_kernels/install_python_libraries.sh @@ -77,6 +77,7 @@ clone_repo() { local repo_url=$1 local dir_name=$2 local key_file=$3 + local commit_hash=$4 if [ -d "$dir_name" ]; then # Check if directory has uncommitted changes (dirty) @@ -87,17 +88,27 @@ clone_repo() { echo "$dir_name directory exists but clone appears incomplete, cleaning up and 
re-cloning" rm -rf "$dir_name" git clone "$repo_url" + if [ -n "$commit_hash" ]; then + cd "$dir_name" + git checkout "$commit_hash" + cd .. + fi else echo "$dir_name directory exists and appears complete; manually update if needed" fi else git clone "$repo_url" + if [ -n "$commit_hash" ]; then + cd "$dir_name" + git checkout "$commit_hash" + cd .. + fi fi } # build and install pplx, require pytorch installed pushd $WORKSPACE -clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" +clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" "c336faf" cd pplx-kernels # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925 # PIP_NO_BUILD_ISOLATION=0 disables build isolation @@ -106,7 +117,7 @@ popd # build and install deepep, require pytorch installed pushd $WORKSPACE -clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" +clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" "e3908bf" cd DeepEP export NVSHMEM_DIR=$WORKSPACE/nvshmem_install PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e . 
From ae067888d6803b0fe0a2201ae9b464a848a0de01 Mon Sep 17 00:00:00 2001 From: weiliang Date: Tue, 26 Aug 2025 09:30:44 +0800 Subject: [PATCH 010/112] Update Flashinfer to 0.2.14.post1 (#23537) Signed-off-by: Siyuan Fu Signed-off-by: siyuanf Signed-off-by: Weiliang Liu Signed-off-by: Michael Goin Co-authored-by: Siyuan Fu Co-authored-by: Michael Goin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docker/Dockerfile | 2 +- setup.py | 2 +- vllm/compilation/collective_fusion.py | 3 ++- vllm/model_executor/layers/quantization/mxfp4.py | 7 ++++++- vllm/v1/worker/gpu_worker.py | 7 ++++--- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 839ac501dbaf0..2e272cbca8417 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -373,7 +373,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # Install FlashInfer from source ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" # Keep this in sync with "flashinfer" extra in setup.py -ARG FLASHINFER_GIT_REF="v0.2.12" +ARG FLASHINFER_GIT_REF="v0.2.14.post1" # Flag to control whether to compile FlashInfer AOT kernels # Set to "true" to enable AOT compilation: # docker build --build-arg FLASHINFER_AOT_COMPILE=true ... 
diff --git a/setup.py b/setup.py index ca6e0a8592cc2..ffe8ec4e79af7 100644 --- a/setup.py +++ b/setup.py @@ -694,7 +694,7 @@ setup( "mistral_common[audio]"], # Required for audio processing "video": [], # Kept for backwards compatibility # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.2.12"], + "flashinfer": ["flashinfer-python==0.2.14.post1"], # Optional deps for AMD FP4 quantization support "petit-kernel": ["petit-kernel"], }, diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 6ae50245ed3a8..c44ac8e0aa7ea 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -465,7 +465,8 @@ if flashinfer_comm is not None: quant_out=quant_out, scale_out=scale_out, # in vllm we only support swizzled layout - layout_code=flashinfer_comm.FP4QuantizationSFLayout.SWIZZLED, + layout_code=flashinfer_comm.QuantizationSFLayout. + SWIZZLED_128x4, scale_factor=scale_factor, ) else: diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 6a190ebbc063e..df96e5d8c413e 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -6,6 +6,7 @@ import torch from torch.nn.parameter import Parameter from vllm import envs +from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase) @@ -113,6 +114,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): self.topk_indices_dtype = None self.moe = moe self.use_marlin = self._should_use_marlin() + self.max_capture_size = get_current_vllm_config( + ).compilation_config.max_capture_size if current_platform.is_device_capability(100) and not has_flashinfer(): logger.warning_once( @@ -520,7 +523,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): x_scale = None else: x_quant, x_scale = 
mxfp8_quantize(x, False) # to mxfp8 - x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1) + x_scale = x_scale.view(torch.float8_e4m3fn).reshape( + *x.shape[:-1], -1) trtllm_gen_output = trtllm_fp4_block_scale_moe( router_logits.to(torch.bfloat16), None, # routing_bias @@ -549,6 +553,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): self._get_tile_tokens_dim(x, top_k), 1 if renormalize else 0, # routing_method_type, renormalize True, # do finalize + tune_max_num_tokens=self.max_capture_size, )[0] return trtllm_gen_output else: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 0dca45a759216..c252193313344 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -310,6 +310,10 @@ class Worker(WorkerBase): logger.info("Compile and warming up model for size %d", size) self.model_runner._dummy_run(size, skip_eplb=True) + # Warmup and tune the kernels used during model execution before + # cuda graph capture. + kernel_warmup(self) + if not self.model_config.enforce_eager: self.model_runner.capture_model() @@ -334,9 +338,6 @@ class Worker(WorkerBase): self.model_runner._dummy_sampler_run( hidden_states=last_hidden_states) - # Warmup kernels used during model execution - kernel_warmup(self) - # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. 
set_random_seed(self.model_config.seed) From 56dcf4e7e965e34043acf20ca4e4aceda21d41ec Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 25 Aug 2025 21:41:21 -0400 Subject: [PATCH 011/112] [Bug] Fix DeepGEMM Env Control (#23591) Signed-off-by: yewentao256 --- vllm/utils/deep_gemm.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index c0a4ed077e660..b0bc3a79eb0ad 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -27,7 +27,7 @@ def is_deep_gemm_supported() -> bool: is_supported_arch = current_platform.is_cuda() and ( current_platform.is_device_capability(90) or current_platform.is_device_capability(100)) - return has_deep_gemm() and is_supported_arch + return envs.VLLM_USE_DEEP_GEMM and has_deep_gemm() and is_supported_arch @functools.cache @@ -35,12 +35,9 @@ def is_blackwell_deep_gemm_e8m0_used() -> bool: """Return ``True`` if vLLM is configured to use DeepGEMM " "E8M0 scale on a Blackwell-class GPU. 
""" - if not (envs.VLLM_USE_DEEP_GEMM): - logger.debug_once("DeepGEMM E8M0 disabled: VLLM_USE_DEEP_GEMM=0.") - return False - - if not has_deep_gemm(): - logger.debug_once("DeepGEMM E8M0 disabled: DeepGEMM backend missing.") + if not is_deep_gemm_supported(): + logger.debug_once( + "DeepGEMM E8M0 disabled: DeepGEMM not supported on this system.") return False if not envs.VLLM_USE_DEEP_GEMM_E8M0: From 6fd45e7b8a3dc216875428835036a9008cdc0fe3 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 26 Aug 2025 10:34:12 +0800 Subject: [PATCH 012/112] [CI/Build] Use vLLM client's user agent to fetch images (#23561) Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_vision.py | 6 ++---- tests/entrypoints/openai/test_vision_embedding.py | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 8259a81d7b6a1..eaa6c2c163af1 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -6,8 +6,6 @@ import json import openai import pytest import pytest_asyncio -import requests -from PIL import Image from transformers import AutoProcessor from vllm.multimodal.utils import encode_image_base64, fetch_image @@ -36,7 +34,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [ ], [ "The image shows a Venn diagram with three over", - "The image shows a Venn diagram with three intersect", + "This image shows a Venn diagram with three intersect", ], [ "This image displays a gradient of colors ranging from", @@ -88,7 +86,7 @@ def get_hf_prompt_tokens(model_name, content, image_url): "role": "user", "content": f"{placeholder}{content}", }] - images = [Image.open(requests.get(image_url, stream=True).raw)] + images = [fetch_image(image_url)] prompt = processor.tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True) diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py 
index 4e6a21058658b..d3cc2fac6af57 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -5,7 +5,6 @@ import json import pytest import requests -from PIL import Image from transformers import AutoProcessor from vllm.entrypoints.openai.protocol import EmbeddingResponse @@ -64,7 +63,7 @@ def get_hf_prompt_tokens(model_name, content, image_url): placeholder = "<|image_1|> " prompt = f"{placeholder}{content}" - images = [Image.open(requests.get(image_url, stream=True).raw)] + images = [fetch_image(image_url)] inputs = processor(prompt, images, return_tensors="pt") return inputs.input_ids.shape[1] From 6fad29b11b3680c44782cd6e5fe555779d620d6c Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Mon, 25 Aug 2025 19:34:15 -0700 Subject: [PATCH 013/112] Remove graph_pool as member of VllmBackend and argument to CUDAGraphWrapper (#23385) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luka Govedič Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com> Co-authored-by: Luka Govedič --- vllm/compilation/backends.py | 14 ++------------ vllm/compilation/base_static_graph.py | 5 +---- vllm/compilation/cuda_graph.py | 8 ++++---- 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 56494dffc96b3..fa86773d24743 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -294,13 +294,12 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): def __init__(self, module: torch.fx.GraphModule, compile_submod_names: list[str], vllm_config: VllmConfig, - graph_pool, vllm_backend: "VllmBackend"): + vllm_backend: "VllmBackend"): super().__init__(module) from torch._guards import detect_fake_mode self.fake_mode = 
detect_fake_mode() self.compile_submod_names = compile_submod_names self.compilation_config = vllm_config.compilation_config - self.graph_pool = graph_pool self.vllm_config = vllm_config self.vllm_backend = vllm_backend # When True, it annoyingly dumps the torch.fx.Graph on errors. @@ -359,7 +358,6 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): runnable=piecewise_backend, vllm_config=self.vllm_config, runtime_mode=CUDAGraphMode.PIECEWISE, - graph_pool=self.graph_pool, cudagraph_options=CUDAGraphOptions( debug_log_enable=piecewise_backend.is_first_graph, gc_disable=not piecewise_backend.is_first_graph, @@ -405,7 +403,6 @@ class VllmBackend: vllm_config: VllmConfig compilation_config: CompilationConfig - graph_pool: Any _called: bool = False # the graph we compiled graph: fx.GraphModule @@ -433,13 +430,6 @@ class VllmBackend: # them, e.g. backbone (default), eagle_head, etc. self.prefix = prefix or model_tag - global_graph_pool = current_platform.get_global_graph_pool() - - # TODO: in the future, if we want to use multiple - # streams, it might not be safe to share a global pool. - # only investigate this when we use multiple streams - self.graph_pool = global_graph_pool - # Passes to run on the graph post-grad. 
self.post_grad_pass_manager = PostGradPassManager() @@ -586,7 +576,7 @@ class VllmBackend: # propagate the split graph to the piecewise backend, # compile submodules with symbolic shapes PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile, - self.vllm_config, self.graph_pool, + self.vllm_config, self).run(*example_inputs) graph_path = os.path.join(local_cache_dir, "computation_graph.py") diff --git a/vllm/compilation/base_static_graph.py b/vllm/compilation/base_static_graph.py index 1c3f52c533b13..161d066ce9fb8 100644 --- a/vllm/compilation/base_static_graph.py +++ b/vllm/compilation/base_static_graph.py @@ -13,7 +13,7 @@ class AbstractStaticGraphWrapper(Protocol): """ def __init__(self, runnable: Callable, vllm_config: VllmConfig, - runtime_mode: CUDAGraphMode, graph_pool: Any, **kwargs): + runtime_mode: CUDAGraphMode, **kwargs): """ Initializes the StaticGraphWrapper class with graph capturing and execution-related configurations. @@ -25,9 +25,6 @@ class AbstractStaticGraphWrapper(Protocol): graph runtime. See CUDAGraphMode in vllm/config.py. Note that only the subset enum `NONE`, `PIECEWISE` and `FULL` are used as concrete runtime mode for cudagraph dispatching. - graph_pool (Any): - Graph memory pool handle, e.g., - `torch.cuda.graph_pool_handle()`. Keyword Args: kwargs: Additional keyword arguments for platform-specific configurations. 
diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py index 65a38197ad4e2..e233f959c0a4a 100644 --- a/vllm/compilation/cuda_graph.py +++ b/vllm/compilation/cuda_graph.py @@ -67,11 +67,9 @@ class CUDAGraphWrapper: runnable: Callable, vllm_config: VllmConfig, runtime_mode: CUDAGraphMode, - graph_pool: Any = None, cudagraph_options: Optional[CUDAGraphOptions] = None): self.runnable = runnable self.vllm_config = vllm_config - self.graph_pool = graph_pool self.runtime_mode = runtime_mode self.compilation_config = vllm_config.compilation_config @@ -81,8 +79,10 @@ class CUDAGraphWrapper: # assert runtime_mode is not NONE(no cudagraph), otherwise, we don't # need to initialize a CUDAGraphWrapper. assert self.runtime_mode != CUDAGraphMode.NONE - if self.graph_pool is None: - self.graph_pool = current_platform.get_global_graph_pool() + # TODO: in the future, if we want to use multiple + # streams, it might not be safe to share a global pool. + # only investigate this when we use multiple streams + self.graph_pool = current_platform.get_global_graph_pool() if cudagraph_options is None: cudagraph_options = CUDAGraphOptions() From b395b3b0a3166d17c75e74f4eaf0ff4b15f2554f Mon Sep 17 00:00:00 2001 From: Zijing Liu Date: Mon, 25 Aug 2025 21:06:00 -0700 Subject: [PATCH 014/112] [Disagg][Perf] Use CUDA event sync instead of blocking `tolist` to avoid unintentional copy ops blocking across different CUDA streams, improving disagg TTIT/TTFT (#22760) Signed-off-by: Zijing Liu Signed-off-by: Zijing Liu --- vllm/v1/worker/gpu_model_runner.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5d49bbaf270bb..4f6cf9a350706 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -316,6 +316,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Cached outputs. 
self._draft_token_ids: Optional[Union[list[list[int]], torch.Tensor]] = None + self.transfer_event = torch.cuda.Event() + self.sampled_token_ids_pinned_cpu = torch.empty( + (self.max_model_len, 1), + dtype=torch.int64, + device="cpu", + pin_memory=True) def _make_buffer(self, *args, dtype: torch.dtype) -> CpuGpuBuffer: return CpuGpuBuffer(*args, @@ -1691,7 +1697,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): max_gen_len = sampled_token_ids.shape[-1] if max_gen_len == 1: # No spec decode tokens. - valid_sampled_token_ids = sampled_token_ids.tolist() + valid_sampled_token_ids = self._to_list(sampled_token_ids) else: # Includes spec decode tokens. valid_sampled_token_ids = self.rejection_sampler.parse_output( @@ -2219,7 +2225,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): - CUDAGraphMode.PIECEWISE: Piecewise cudagraph. - CUDAGraphMode.FULL: Full cudagraph, attention metadata is needed. - force_attention: If True, always create attention metadata. Used to + force_attention: If True, always create attention metadata. Used to warm up attention backend when mode is NONE. uniform_decode: If True, the batch is a uniform decode batch. skip_eplb: If True, skip EPLB state update. @@ -3233,3 +3239,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): mamba_type=mamba_module.mamba_type) return kv_cache_spec + + def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: + # This is a short term mitigation for issue mentioned in + # https://github.com/vllm-project/vllm/issues/22754. + # `tolist` would trigger a cuda wise stream sync, which + # would block other copy ops from other cuda streams. + # A cuda event sync would avoid such a situation. Since + # this is in the critical path of every single model + # forward loop, this has caused perf issue for a disagg + # setup. 
+ pinned = self.sampled_token_ids_pinned_cpu[:sampled_token_ids.shape[0]] + pinned.copy_(sampled_token_ids, non_blocking=True) + self.transfer_event.record() + self.transfer_event.synchronize() + return pinned.tolist() From ce0e9dbd43e798d5b27a2a379aa4e13d91a279e3 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 26 Aug 2025 14:13:03 +0800 Subject: [PATCH 015/112] [CI/Build] Fix typo in #23561 (#23616) Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index eaa6c2c163af1..106ec121a422e 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -34,7 +34,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [ ], [ "The image shows a Venn diagram with three over", - "This image shows a Venn diagram with three intersect", + "The image shows a Venn diagram with three intersect", ], [ "This image displays a gradient of colors ranging from", From 959783fb996d0d15598f45ca12ffcbee4b681424 Mon Sep 17 00:00:00 2001 From: Bin Jia <45593998+FoolPlayer@users.noreply.github.com> Date: Tue, 26 Aug 2025 14:16:36 +0800 Subject: [PATCH 016/112] [fix] fix seed-oss-parser (#23560) Signed-off-by: jiabin.00 --- tests/tool_use/test_seed_oss_tool_parser.py | 9 ++------- .../openai/tool_parsers/seed_oss_tool_parser.py | 3 +++ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/tool_use/test_seed_oss_tool_parser.py b/tests/tool_use/test_seed_oss_tool_parser.py index d85bc9bbf1b30..c276a598aa68c 100644 --- a/tests/tool_use/test_seed_oss_tool_parser.py +++ b/tests/tool_use/test_seed_oss_tool_parser.py @@ -102,9 +102,7 @@ def test_extract_tool_calls_no_tools(seed_oss_tool_parser): ], argnames=["model_output", "expected_tool_calls", "expected_content"], argvalues=[ - ("""\n\n\n""" - """The current thinking budget is 0, so I will directly start answering the question.\n\n""" - 
"""\n\n""" + ("""\n\n""" """Barcelona, Spain\n\n""", [ ToolCall(function=FunctionCall( @@ -114,10 +112,7 @@ def test_extract_tool_calls_no_tools(seed_oss_tool_parser): }, ), ), type='function') - ], - """\n\n\n""" - """The current thinking budget is 0, so I will directly start answering the question.\n\n""" - ), + ], None), ( """The user\'s current thinking budget is 512.\nLet me analyze the """ """question. The user wants to know the weather in Barcelona, Spain. Looking at the functions available, """ diff --git a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py index 69cf2e68f7c41..95458f07ff2a2 100644 --- a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py @@ -271,6 +271,9 @@ class SeedOssToolParser(ToolParser): # Extract content after think end token result_content = model_output[think_end_index:] thinking_content = model_output[:think_end_index] + else: + thinking_content = "" + result_content = model_output try: function_calls = self._get_function_calls(result_content) From 7d67a9d9f93f86b74066c64c373405aa088e4a16 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 26 Aug 2025 14:50:17 +0800 Subject: [PATCH 017/112] [mypy] Fix incorrect type hint for EAGLE3 support (#23617) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/llama.py | 6 +++--- vllm/model_executor/models/qwen2.py | 6 +++--- vllm/model_executor/models/qwen3.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f99f1c3643fd4..e39a6df843cd4 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -353,7 +353,7 @@ class LlamaModel(nn.Module): else: self.norm = PPMissingLayer() - self.aux_hidden_state_layers: tuple[int] = tuple() + self.aux_hidden_state_layers = tuple[int, ...]() 
self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( @@ -553,10 +553,10 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) - def set_aux_hidden_state_layers(self, layers: tuple[int]) -> None: + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: self.model.aux_hidden_state_layers = layers - def get_eagle3_aux_hidden_state_layers(self) -> tuple[int]: + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: num_layers = len(self.model.layers) return (2, num_layers // 2, num_layers - 3) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 801741ecaf3b8..27c1e68c6704b 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -333,7 +333,7 @@ class Qwen2Model(nn.Module): else: self.norm = PPMissingLayer() - self.aux_hidden_state_layers: tuple[int] = tuple() + self.aux_hidden_state_layers = tuple[int, ...]() def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) @@ -488,10 +488,10 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) - def set_aux_hidden_state_layers(self, layers: tuple[int]) -> None: + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: self.model.aux_hidden_state_layers = layers - def get_eagle3_aux_hidden_state_layers(self) -> tuple[int]: + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: num_layers = len(self.model.layers) return (2, num_layers // 2, num_layers - 3) diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 2060206633702..dddb47048a1fc 100644 --- a/vllm/model_executor/models/qwen3.py +++ 
b/vllm/model_executor/models/qwen3.py @@ -304,10 +304,10 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) - def set_aux_hidden_state_layers(self, layers: tuple[int]) -> None: + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: self.model.aux_hidden_state_layers = layers - def get_eagle3_aux_hidden_state_layers(self) -> tuple[int]: + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: num_layers = len(self.model.layers) return (2, num_layers // 2, num_layers - 3) From 3ecbb14b814f9559bce88fa62ea8b5deedbc6076 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Tue, 26 Aug 2025 14:57:08 +0800 Subject: [PATCH 018/112] [Benchmarks] add benchmark for embedding models (#23000) Signed-off-by: zjy0516 --- vllm/benchmarks/datasets.py | 67 +++-- vllm/benchmarks/lib/endpoint_request_func.py | 57 +++- vllm/benchmarks/serve.py | 257 +++++++++++++------ 3 files changed, 274 insertions(+), 107 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index e586337367b1c..93519b5ba1523 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -73,7 +73,7 @@ class SampleRequest: Represents a single inference request for benchmarking. """ - prompt: Union[str, Any] + prompt: Union[str, list[str]] prompt_len: int expected_output_len: int multi_modal_data: Optional[ @@ -409,6 +409,7 @@ class RandomDataset(BenchmarkDataset): range_ratio: float = DEFAULT_RANGE_RATIO, input_len: int = DEFAULT_INPUT_LEN, output_len: int = DEFAULT_OUTPUT_LEN, + batchsize: int = 1, **kwargs, ) -> list[SampleRequest]: @@ -439,6 +440,21 @@ class RandomDataset(BenchmarkDataset): request_id=request_id_prefix + str(i), ) ) + # only used for embeddings benchmark. 
+ if batchsize > 1: + batch_requests = [] + # Create batched requests + for i in range(0, num_requests, batchsize): + batch = requests[i : i + batchsize] + batch_requests.append( + SampleRequest( + prompt=[req.prompt for req in batch], + prompt_len=sum(req.prompt_len for req in batch), + expected_output_len=0, + request_id=request_id_prefix + str(i // batchsize), + ) + ) + requests = batch_requests return requests def get_prefix( @@ -475,8 +491,8 @@ class RandomDataset(BenchmarkDataset): input_high = math.ceil(real_input_len * (1 + range_ratio)) output_low = math.floor(output_len * (1 - range_ratio)) output_high = math.ceil(output_len * (1 + range_ratio)) - # Ensure the lower bound for output length is at least 1 to - # prevent sampling 0 tokens. + # Ensure the lower bound for output length is at least 1 to + # prevent sampling 0 tokens. output_low = max(output_low, 1) if input_low > input_high: @@ -506,7 +522,6 @@ class RandomDataset(BenchmarkDataset): size=num_requests) return input_lens, output_lens, offsets - def generate_token_sequence( self, *, @@ -1105,6 +1120,13 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "context length sampled from [input_len * (1 - range_ratio), " "input_len * (1 + range_ratio)]."), ) + random_group.add_argument( + "--random-batch-size", + type=int, + default=1, + help=("Batch size for random sampling. " + "Only used for embeddings benchmark."), + ) # random multimodal dataset options random_mm_group = parser.add_argument_group( @@ -1196,8 +1218,6 @@ def add_dataset_parser(parser: FlexibleArgumentParser): ), ) - - hf_group = parser.add_argument_group("hf dataset options") hf_group.add_argument("--hf-subset", type=str, @@ -1348,22 +1368,24 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: else: # For datasets that follow a similar structure, use a mapping. 
dataset_mapping = { - "sharegpt": - lambda: ShareGPTDataset(random_seed=args.seed, - dataset_path=args.dataset_path).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - output_len=args.sharegpt_output_len, - request_id_prefix=args.request_id_prefix, - ), - "burstgpt": - lambda: BurstGPTDataset(random_seed=args.seed, - dataset_path=args.dataset_path). - sample(tokenizer=tokenizer, num_requests=args.num_prompts, - request_id_prefix=args.request_id_prefix,), - "random": - lambda: RandomDataset(random_seed=args.seed, - dataset_path=args.dataset_path).sample( + "sharegpt": lambda: ShareGPTDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, + request_id_prefix=args.request_id_prefix, + ), + "burstgpt": lambda: BurstGPTDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + request_id_prefix=args.request_id_prefix, + ), + "random": lambda: RandomDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( tokenizer=tokenizer, num_requests=args.num_prompts, prefix_len=args.random_prefix_len, @@ -1371,6 +1393,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: output_len=args.random_output_len, range_ratio=args.random_range_ratio, request_id_prefix=args.request_id_prefix, + batchsize=args.random_batch_size, ), "random-mm": lambda: RandomMultiModalDataset( diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index 76beded4d5189..6bb2a497119e9 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -69,8 +69,8 @@ async def async_request_openai_completions( ), "OpenAI Completions API URL must end with 'completions' or 'profile'." 
payload = { - "model": request_func_input.model_name \ - if request_func_input.model_name else request_func_input.model, + "model": request_func_input.model_name + if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, "repetition_penalty": 1.0, @@ -135,7 +135,7 @@ async def async_request_openai_completions( # Decoding phase else: output.itl.append(timestamp - - most_recent_timestamp) + most_recent_timestamp) most_recent_timestamp = timestamp generated_text += text or "" @@ -254,7 +254,7 @@ async def async_request_openai_chat_completions( # Decoding phase else: output.itl.append(timestamp - - most_recent_timestamp) + most_recent_timestamp) generated_text += content or "" elif usage := data.get("usage"): @@ -394,12 +394,61 @@ async def async_request_openai_audio( return output +async def async_request_openai_embeddings( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: Optional[tqdm] = None, +): + api_url = request_func_input.api_url + assert api_url.endswith( + "embeddings" + ), "OpenAI Embeddings API URL must end with 'embeddings'." 
+ + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + payload = { + "model": request_func_input.model, + "input": request_func_input.prompt, + } + + output = RequestFuncOutput() + st = time.perf_counter() + try: + async with session.post( + url=api_url, + headers=headers, + json=payload + ) as response: + if response.status == 200: + output.latency = time.perf_counter() - st + data = await response.json() + output.success = True + output.generated_text = "" + output.prompt_len = data.get( + "usage", {}).get( + "prompt_tokens", 0) + else: + output.success = False + output.error = response.reason or "" + except Exception as e: + output.success = False + output.error = str(e) + + if pbar: + pbar.update(1) + return output + + # TODO: Add more request functions for different API protocols. ASYNC_REQUEST_FUNCS = { "vllm": async_request_openai_completions, "openai": async_request_openai_completions, "openai-chat": async_request_openai_chat_completions, "openai-audio": async_request_openai_audio, + "openai-embeddings": async_request_openai_embeddings, } OPENAI_COMPATIBLE_BACKENDS = [ diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 79f2c475cbe5d..abb838316cd31 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -4,7 +4,7 @@ r"""Benchmark online serving throughput. 
On the server side, run one of the following commands to launch the vLLM OpenAI API server: - vllm serve + vllm serve On the client side, run: vllm bench serve \ @@ -26,6 +26,7 @@ import warnings from collections.abc import AsyncGenerator, Iterable from dataclasses import dataclass from datetime import datetime +from enum import Enum from typing import Any, Literal, Optional import aiohttp @@ -46,6 +47,11 @@ from vllm.transformers_utils.tokenizer import get_tokenizer MILLISECONDS_TO_SECONDS_CONVERSION = 1000 +class TaskType(Enum): + GENERATION = "generation" + EMBEDDING = "embedding" + + @dataclass class BenchmarkMetrics: completed: int @@ -75,6 +81,16 @@ class BenchmarkMetrics: std_e2el_ms: float percentiles_e2el_ms: list[tuple[float, float]] +@dataclass +class EmbedBenchmarkMetrics: + completed: int + total_input: int + request_throughput: float + total_token_throughput :float + mean_e2el_ms: float + std_e2el_ms: float + median_e2el_ms: float + percentiles_e2el_ms: float def _get_current_request_rate( ramp_up_strategy: Optional[Literal["linear", "exponential"]], @@ -146,11 +162,11 @@ async def get_request( delay_ts = [] for request_index, request in enumerate(input_requests): current_request_rate = _get_current_request_rate(ramp_up_strategy, - ramp_up_start_rps, - ramp_up_end_rps, - request_index, - total_requests, - request_rate) + ramp_up_start_rps, + ramp_up_end_rps, + request_index, + total_requests, + request_rate) request_rates.append(current_request_rate) if current_request_rate == float("inf"): delay_ts.append(0) @@ -160,7 +176,7 @@ async def get_request( # Sample the request interval from the gamma distribution. # If burstiness is 1, it follows exponential distribution. delay_ts.append(np.random.gamma(shape=burstiness, scale=theta)) - + # Calculate the cumulative delay time from the first sent out requests. 
for i in range(1, len(delay_ts)): delay_ts[i] += delay_ts[i - 1] @@ -170,11 +186,11 @@ async def get_request( # logic would re-scale delay time to ensure the final delay_ts # align with target_total_delay_s. # - # NOTE: If we simply accumulate the random delta values - # from the gamma distribution, their sum would have 1-2% gap + # NOTE: If we simply accumulate the random delta values + # from the gamma distribution, their sum would have 1-2% gap # from target_total_delay_s. The purpose of the following logic is to - # close the gap for stablizing the throughput data - # from different random seeds. + # close the gap for stablizing the throughput data + # from different random seeds. target_total_delay_s = total_requests / request_rate normalize_factor = target_total_delay_s / delay_ts[-1] delay_ts = [delay * normalize_factor for delay in delay_ts] @@ -189,6 +205,51 @@ async def get_request( yield request, request_rates[request_index] +def calculate_metrics_for_embeddings( + outputs: list[RequestFuncOutput], + dur_s: float, + selected_percentiles: list[float] +) -> EmbedBenchmarkMetrics: + """Calculate the metrics for the embedding requests. + + Args: + outputs: The outputs of the requests. + dur_s: The duration of the benchmark. + selected_percentiles: The percentiles to select. + + Returns: + The calculated benchmark metrics. + """ + total_input = 0 + completed = 0 + e2els: list[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + e2els.append(outputs[i].latency) + completed += 1 + total_input += outputs[i].prompt_len + + if completed == 0: + warnings.warn( + "All requests failed. 
This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2) + metrics = EmbedBenchmarkMetrics( + completed=completed, + total_input=total_input, + request_throughput=completed / dur_s, + total_token_throughput=total_input / dur_s, + mean_e2el_ms=np.mean(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, + percentiles_e2el_ms=[ + (p, np.percentile(e2els or 0, p) * 1000) + for p in selected_percentiles + ], + ) + return metrics + + def calculate_metrics( input_requests: list[SampleRequest], outputs: list[RequestFuncOutput], @@ -334,8 +395,16 @@ async def benchmark( ramp_up_end_rps: Optional[int] = None, ready_check_timeout_sec: int = 600, ): + task_type = ( + TaskType.EMBEDDING + if api_url.endswith("/v1/embeddings") + else TaskType.GENERATION + ) if endpoint_type in ASYNC_REQUEST_FUNCS: - request_func = ASYNC_REQUEST_FUNCS[endpoint_type] + if task_type == TaskType.EMBEDDING: + request_func = ASYNC_REQUEST_FUNCS["openai-embeddings"] + else: + request_func = ASYNC_REQUEST_FUNCS[endpoint_type] else: raise ValueError(f"Unknown endpoint_type: {endpoint_type}") @@ -421,8 +490,8 @@ async def benchmark( if profile_output.success: print("Profiler started") - distribution = ("Poisson process" if burstiness == 1.0 - else "Gamma distribution") + distribution = ("Poisson process" if burstiness == 1.0 + else "Gamma distribution") if ramp_up_strategy is not None: print(f"Traffic ramp-up strategy: {ramp_up_strategy}.") @@ -449,7 +518,7 @@ async def benchmark( session=session, pbar=pbar) async with semaphore: - return await request_func(request_func_input=request_func_input, + return await request_func(request_func_input=request_func_input, session=session, pbar=pbar) @@ -513,14 +582,22 @@ async def benchmark( benchmark_duration = time.perf_counter() - benchmark_start_time - metrics, actual_output_lens = calculate_metrics( - input_requests=input_requests, - outputs=outputs, - 
dur_s=benchmark_duration, - tokenizer=tokenizer, - selected_percentiles=selected_percentiles, - goodput_config_dict=goodput_config_dict, - ) + if task_type == TaskType.GENERATION: + metrics, actual_output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentiles=selected_percentiles, + goodput_config_dict=goodput_config_dict, + ) + else: + metrics = calculate_metrics_for_embeddings( + outputs=outputs, + dur_s=benchmark_duration, + selected_percentiles=selected_percentiles, + ) + actual_output_lens = 0 print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) @@ -529,39 +606,55 @@ async def benchmark( max_concurrency)) if request_rate != float('inf'): print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", - request_rate )) + request_rate)) print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", - metrics.total_output)) + if isinstance(metrics, BenchmarkMetrics): + print("{:<40} {:<10}".format( + "Total generated tokens:", metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) if goodput_config_dict: print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput)) - print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", - metrics.output_throughput)) + if isinstance(metrics, BenchmarkMetrics): + print( + "{:<40} {:<10.2f}".format( + "Output token throughput (tok/s):", metrics.output_throughput + ) + ) print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", metrics.total_token_throughput)) - result = { - "duration": benchmark_duration, - "completed": metrics.completed, - "total_input_tokens": metrics.total_input, - 
"total_output_tokens": metrics.total_output, - "request_throughput": metrics.request_throughput, - "request_goodput": - metrics.request_goodput if goodput_config_dict else None, - "output_throughput": metrics.output_throughput, - "total_token_throughput": metrics.total_token_throughput, - "input_lens": [output.prompt_len for output in outputs], - "output_lens": actual_output_lens, - "ttfts": [output.ttft for output in outputs], - "itls": [output.itl for output in outputs], - "generated_texts": [output.generated_text for output in outputs], - "errors": [output.error for output in outputs], - } + if isinstance(metrics, BenchmarkMetrics): + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, + "request_goodput": + metrics.request_goodput if goodput_config_dict else None, + "output_throughput": metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "output_lens": actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "generated_texts": [output.generated_text for output in outputs], + "errors": [output.error for output in outputs], + } + else: + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "request_throughput": metrics.request_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "errors": [output.error for output in outputs], + } if rps_change_events: result["rps_change_events"] = rps_change_events @@ -598,10 +691,11 @@ async def benchmark( value)) result[f"p{p_word}_{metric_attribute_name}_ms"] = value - process_one_metric("ttft", "TTFT", "Time to First Token") - process_one_metric("tpot", 
"TPOT", - "Time per Output Token (excl. 1st token)") - process_one_metric("itl", "ITL", "Inter-token Latency") + if task_type == TaskType.GENERATION: + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric( + "tpot", "TPOT", "Time per Output Token (excl. 1st token)") + process_one_metric("itl", "ITL", "Inter-token Latency") process_one_metric("e2el", "E2EL", "End-to-end Latency") print("=" * 50) @@ -732,7 +826,8 @@ def add_cli_args(parser: argparse.ArgumentParser): "initiated, this argument will control how many are actually allowed " "to execute at a time. This means that when used in combination, the " "actual request rate may be lower than specified with --request-rate, " - "if the server is not processing requests fast enough to keep up.") + "if the server is not processing requests fast enough to keep up.", + ) parser.add_argument( "--model", @@ -743,8 +838,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--tokenizer", type=str, - help= - "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) parser.add_argument("--use-beam-search", action="store_true") parser.add_argument( @@ -968,6 +1062,7 @@ def add_cli_args(parser: argparse.ArgumentParser): def main(args: argparse.Namespace) -> dict[str, Any]: return asyncio.run(main_async(args)) + async def main_async(args: argparse.Namespace) -> dict[str, Any]: print(args) random.seed(args.seed) @@ -1046,32 +1141,32 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: gc.freeze() benchmark_result = await benchmark( - endpoint_type=args.endpoint_type, - api_url=api_url, - base_url=base_url, - model_id=model_id, - model_name=model_name, - tokenizer=tokenizer, - input_requests=input_requests, - logprobs=args.logprobs, - request_rate=args.request_rate, - burstiness=args.burstiness, - disable_tqdm=args.disable_tqdm, - profile=args.profile, 
- selected_percentile_metrics=args.percentile_metrics.split(","), - selected_percentiles=[ - float(p) for p in args.metric_percentiles.split(",") - ], - ignore_eos=args.ignore_eos, - goodput_config_dict=goodput_config_dict, - max_concurrency=args.max_concurrency, - lora_modules=args.lora_modules, - extra_body=sampling_params, - ramp_up_strategy=args.ramp_up_strategy, - ramp_up_start_rps=args.ramp_up_start_rps, - ramp_up_end_rps=args.ramp_up_end_rps, - ready_check_timeout_sec=args.ready_check_timeout_sec, - ) + endpoint_type=args.endpoint_type, + api_url=api_url, + base_url=base_url, + model_id=model_id, + model_name=model_name, + tokenizer=tokenizer, + input_requests=input_requests, + logprobs=args.logprobs, + request_rate=args.request_rate, + burstiness=args.burstiness, + disable_tqdm=args.disable_tqdm, + profile=args.profile, + selected_percentile_metrics=args.percentile_metrics.split(","), + selected_percentiles=[ + float(p) for p in args.metric_percentiles.split(",") + ], + ignore_eos=args.ignore_eos, + goodput_config_dict=goodput_config_dict, + max_concurrency=args.max_concurrency, + lora_modules=args.lora_modules, + extra_body=sampling_params, + ramp_up_strategy=args.ramp_up_strategy, + ramp_up_start_rps=args.ramp_up_start_rps, + ramp_up_end_rps=args.ramp_up_end_rps, + ready_check_timeout_sec=args.ready_check_timeout_sec, + ) # Save config and results to json result_json: dict[str, Any] = {} @@ -1098,7 +1193,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: # Traffic result_json["request_rate"] = (args.request_rate if args.request_rate - < float("inf") else "inf") + < float("inf") else "inf") result_json["burstiness"] = args.burstiness result_json["max_concurrency"] = args.max_concurrency @@ -1132,7 +1227,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: if args.max_concurrency is not None else "") label = label or endpoint_type if args.ramp_up_strategy is not None: - file_name = 
f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa + file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa else: file_name = f"{label}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa if args.result_filename: @@ -1149,4 +1244,4 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: json.dump(result_json, outfile) save_to_pytorch_benchmark_format(args, result_json, file_name) - return result_json \ No newline at end of file + return result_json From bfc1edc9f5bde581e0eec5c830a5a4a7b710fe6a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 26 Aug 2025 08:16:44 +0100 Subject: [PATCH 019/112] [Docs] Fix titles for multi-file examples that are rendered in the docs (#23573) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/mkdocs/hooks/generate_examples.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 1e8b848db46d8..881df791698e2 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -70,6 +70,10 @@ class Example: self.other_files = self.determine_other_files() self.title = self.determine_title() + @property + def is_code(self) -> bool: + return self.main_file.suffix != ".md" + def determine_main_file(self) -> Path: """ Determines the main file in the given path. 
@@ -101,6 +105,12 @@ class Example: return [file for file in self.path.rglob("*") if is_other_file(file)] def determine_title(self) -> str: + if not self.is_code: + with open(self.main_file) as f: + first_line = f.readline().strip() + match = re.match(r'^#\s+(?P.+)$', first_line) + if match: + return match.group('title') return fix_case(self.path.stem.replace("_", " ").title()) def generate(self) -> str: @@ -110,11 +120,13 @@ class Example: # Use long code fence to avoid issues with # included files containing code fences too code_fence = "``````" - is_code = self.main_file.suffix != ".md" - if is_code: + # Skip the title from md snippets as it's been included above + start_line = 2 + if self.is_code: content += f"{code_fence}{self.main_file.suffix[1:]}\n" - content += f'--8<-- "{self.main_file}"\n' - if is_code: + start_line = 1 + content += f'--8<-- "{self.main_file}:{start_line}"\n' + if self.is_code: content += f"{code_fence}\n" content += "\n" From ff77764f868290bf746d101d3998095b73e7811d Mon Sep 17 00:00:00 2001 From: Raghavan <oneraghavan@gmail.com> Date: Tue, 26 Aug 2025 13:35:37 +0530 Subject: [PATCH 020/112] Fix CLI parameter documentation inconsistency in pooling_models.md (#23630) --- docs/models/pooling_models.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 39f209d0eb7ed..753d8bd0b8339 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -205,12 +205,12 @@ Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json,` it is allowed to change the output to arbitrary dimensions. Using `matryoshka_dimensions` can control the allowed output dimensions. 
-For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline) or `--hf_overrides '{"is_matryoshka": true}'`, `--hf_overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'`(online). +For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline) or `--hf-overrides '{"is_matryoshka": true}'`, `--hf-overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'`(online). Here is an example to serve a model with Matryoshka Embeddings enabled. ```text -vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"matryoshka_dimensions":[256]}' +vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}' ``` ### Offline Inference From 9b5f64238fbd0f98928587b3426cbf69eea96ae7 Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Tue, 26 Aug 2025 16:09:14 +0800 Subject: [PATCH 021/112] [Bugfix] Fix Qwen25VL packed_modules_mapping (#23604) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> --- vllm/model_executor/models/qwen2_5_vl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 0f11636ce3bd3..648ba81eb3877 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -853,6 +853,7 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsQuant): packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], } From b5d34af3286ee0334d9f7bd729774ac55c5805e9 Mon Sep 17 00:00:00 2001 From: 
Roger Wang <hey@rogerw.io> Date: Tue, 26 Aug 2025 02:46:28 -0700 Subject: [PATCH 022/112] [Bugfix] Fix scheduling when repeated images in one request (#23544) Signed-off-by: Roger Wang <hey@rogerw.me> Signed-off-by: Roger Wang <hey@rogerw.io> Co-authored-by: Roger Wang <hey@rogerw.me> Co-authored-by: knlnguyen1802 <knlnguyen1802@gmail.com> --- tests/v1/core/test_encoder_cache_manager.py | 49 ++++++++++++++----- vllm/v1/core/encoder_cache_manager.py | 32 +++++++----- vllm/v1/core/sched/scheduler.py | 54 +++++++++++++++------ 3 files changed, 96 insertions(+), 39 deletions(-) diff --git a/tests/v1/core/test_encoder_cache_manager.py b/tests/v1/core/test_encoder_cache_manager.py index 60d932a878abb..ae5b751f45a4b 100644 --- a/tests/v1/core/test_encoder_cache_manager.py +++ b/tests/v1/core/test_encoder_cache_manager.py @@ -22,7 +22,7 @@ def test_basic_allocate_and_reuse(): req = MockRequest("r1", ["imgA"], [4]) assert not cache.check_and_update_cache(req, 0) - assert cache.try_allocate(req, 0, int(1e9)) + assert cache.can_allocate(req, 0, int(1e9), 0) cache.allocate(req, 0) @@ -44,7 +44,7 @@ def test_freeing_decreases_refcount_and_moves_to_freeable(): manager = EncoderCacheManager(cache_size=10) req = MockRequest("req2", ["img3"], [5]) - assert manager.try_allocate(req, 0, int(1e9)) + assert manager.can_allocate(req, 0, int(1e9), 0) manager.allocate(req, 0) assert len(manager.cached["img3"]) == 1 @@ -60,10 +60,10 @@ def test_free_request_frees_all_inputs(): manager = EncoderCacheManager(cache_size=10) req = MockRequest("req3", ["a", "b"], [2, 3]) - assert manager.try_allocate(req, 0, int(1e9)) + assert manager.can_allocate(req, 0, int(1e9), 0) manager.allocate(req, 0) - assert manager.try_allocate(req, 1, int(1e9)) + assert manager.can_allocate(req, 1, int(1e9), 0) manager.allocate(req, 1) assert len(manager.cached["a"]) == 1 @@ -84,11 +84,11 @@ def test_eviction_when_cache_is_full(): req1 = MockRequest("req1", ["x"], [6]) req2 = MockRequest("req2", ["y"], [5]) - assert 
manager.try_allocate(req1, 0, int(1e9)) + assert manager.can_allocate(req1, 0, int(1e9), 0) manager.allocate(req1, 0) manager.free_encoder_input(req1, 0) - assert manager.try_allocate(req2, 0, int(1e9)) + assert manager.can_allocate(req2, 0, int(1e9), 0) manager.allocate(req2, 0) # 'x' should have been evicted. @@ -100,10 +100,10 @@ def test_get_cached_input_ids(): manager = EncoderCacheManager(cache_size=10) req = MockRequest("reqX", ["m", "n", "o"], [2, 4, 3]) - assert manager.try_allocate(req, 0, int(1e9)) + assert manager.can_allocate(req, 0, int(1e9), 0) manager.allocate(req, 0) - assert manager.try_allocate(req, 2, int(1e9)) + assert manager.can_allocate(req, 2, int(1e9), 0) manager.allocate(req, 2) cached_ids = manager.get_cached_input_ids(req) @@ -114,7 +114,7 @@ def test_has_cache_restores_from_freeable(): manager = EncoderCacheManager(cache_size=10) req = MockRequest("reqY", ["imgZ"], [4]) - assert manager.try_allocate(req, 0, int(1e9)) + assert manager.can_allocate(req, 0, int(1e9), 0) manager.allocate(req, 0) manager.free_encoder_input(req, 0) @@ -131,14 +131,41 @@ def test_get_freed_mm_hashes_clears_freed_list(): req1 = MockRequest("reqA", ["a"], [5]) req2 = MockRequest("reqB", ["b"], [6]) - assert manager.try_allocate(req1, 0, int(1e9)) + assert manager.can_allocate(req1, 0, int(1e9), 0) manager.allocate(req1, 0) manager.free_encoder_input(req1, 0) # Should trigger eviction of 'a'. 
- assert manager.try_allocate(req2, 0, int(1e9)) + assert manager.can_allocate(req2, 0, int(1e9), 0) manager.allocate(req2, 0) freed = manager.get_freed_mm_hashes() assert "a" in freed assert manager.get_freed_mm_hashes() == [] + + +def test_schedule_request_multi_images_respect_space_limit(): + manager = EncoderCacheManager(cache_size=10) + req = MockRequest("reqA", ["a", "b"], [5, 6]) + compute_budget = 100 + + num_tokens_to_schedule = 0 + assert manager.can_allocate(req, 0, compute_budget, num_tokens_to_schedule) + num_tokens_to_schedule += req.get_num_encoder_tokens(0) + compute_budget -= req.get_num_encoder_tokens(0) + + assert not manager.can_allocate(req, 1, compute_budget, + num_tokens_to_schedule) + + +def test_schedule_request_multi_images_respect_compute_limit(): + manager = EncoderCacheManager(cache_size=100) + req = MockRequest("reqA", ["a", "b"], [5, 6]) + compute_budget = 10 + num_tokens_to_schedule = 0 + assert manager.can_allocate(req, 0, compute_budget, num_tokens_to_schedule) + num_tokens_to_schedule += req.get_num_encoder_tokens(0) + compute_budget -= req.get_num_encoder_tokens(0) + + assert not manager.can_allocate(req, 1, compute_budget, + num_tokens_to_schedule) diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 70af419fcb955..c9d18033a1988 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -99,8 +99,9 @@ class EncoderCacheManager: self.cached[mm_hash].add(request.request_id) return True - def try_allocate(self, request: Request, input_id: int, - encoder_budget: int) -> bool: + def can_allocate(self, request: Request, input_id: int, + encoder_compute_budget: int, + num_tokens_to_schedule: int) -> bool: """Check if there's sufficient cache space for a multimodal input. If there is, return True and update EncoderCacheManager state. @@ -116,6 +117,10 @@ class EncoderCacheManager: Args: request: The request containing the multimodal input. 
input_id: Index of the multimodal input within the request. + encoder_compute_budget: Number of encoder tokens allowed to be + computed when this method is invoked. + num_tokens_to_schedule: Number of tokens already scheduled to be + allocated with cache space when this method is invoked. Returns: True if there's enough capacity to hold the encoder output for this @@ -128,13 +133,13 @@ class EncoderCacheManager: num_tokens = request.get_num_encoder_tokens(input_id) # Not enough compute budget - if num_tokens > encoder_budget: + if num_tokens > encoder_compute_budget: return False + num_tokens += num_tokens_to_schedule + # Enough free slots if num_tokens <= self.num_free_slots: - self.num_free_slots -= num_tokens - self.num_freeable_slots -= num_tokens return True # Not enough reclaimable slots @@ -149,8 +154,6 @@ class EncoderCacheManager: del self.cached[mm_hash] self.freed.append(mm_hash) self.num_free_slots += num_free_token - self.num_free_slots -= num_tokens - self.num_freeable_slots -= num_tokens return True def allocate(self, request: Request, input_id: int) -> None: @@ -161,19 +164,24 @@ class EncoderCacheManager: the model runner; this method updates the manager's bookkeeping. Note: - This method assumes try_allocate() returned True for the same input. + This method assumes can_allocate() returned True for the same input. """ - # Encoder cache space budget should be already updated for the - # multimodal input and non-negative after try_allocate() is called. - assert self.num_free_slots >= 0 - assert self.num_freeable_slots >= 0 mm_hash = request.mm_hashes[input_id] request_id = request.request_id if mm_hash not in self.cached: self.cached[mm_hash] = set() + num_encoder_tokens = request.get_num_encoder_tokens(input_id) + + # NOTE: Encoder cache should always have enough space for encoder inputs + # that are scheduled since eviction takes place at can_allocate(). 
+ assert self.num_free_slots >= num_encoder_tokens + assert self.num_freeable_slots >= num_encoder_tokens + self.cached[mm_hash].add(request_id) + self.num_free_slots -= num_encoder_tokens + self.num_freeable_slots -= num_encoder_tokens def get_cached_input_ids(self, request: Request) -> set[int]: """Get all cached multimodal input IDs for a request. diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 956e23afa0d73..522b340b32aaf 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -182,7 +182,7 @@ class Scheduler(SchedulerInterface): token_budget = self.max_num_scheduled_tokens # Encoder-related. scheduled_encoder_inputs: dict[str, list[int]] = {} - encoder_budget = self.max_num_encoder_input_tokens + encoder_compute_budget = self.max_num_encoder_input_tokens # Spec decode-related. scheduled_spec_decode_tokens: dict[str, list[int]] = {} @@ -211,12 +211,13 @@ class Scheduler(SchedulerInterface): # Schedule encoder inputs. encoder_inputs_to_schedule = None - new_encoder_budget = encoder_budget + new_encoder_compute_budget = encoder_compute_budget if request.has_encoder_inputs: (encoder_inputs_to_schedule, num_new_tokens, - new_encoder_budget) = self._try_schedule_encoder_inputs( + new_encoder_compute_budget + ) = self._try_schedule_encoder_inputs( request, request.num_computed_tokens, num_new_tokens, - encoder_budget) + encoder_compute_budget) if num_new_tokens == 0: # The request cannot be scheduled because one of the following @@ -298,7 +299,7 @@ class Scheduler(SchedulerInterface): # Allocate the encoder cache. 
for i in encoder_inputs_to_schedule: self.encoder_cache_manager.allocate(request, i) - encoder_budget = new_encoder_budget + encoder_compute_budget = new_encoder_compute_budget # Record the LoRAs in scheduled_running_reqs scheduled_loras: set[int] = set() @@ -382,7 +383,7 @@ class Scheduler(SchedulerInterface): num_computed_tokens = request.num_computed_tokens encoder_inputs_to_schedule = None - new_encoder_budget = encoder_budget + new_encoder_compute_budget = encoder_compute_budget # KVTransfer: loading remote KV, do not allocate for new work. if load_kv_async: @@ -413,10 +414,10 @@ class Scheduler(SchedulerInterface): # Schedule encoder inputs. if request.has_encoder_inputs: (encoder_inputs_to_schedule, num_new_tokens, - new_encoder_budget + new_encoder_compute_budget ) = self._try_schedule_encoder_inputs( request, num_computed_tokens, num_new_tokens, - encoder_budget) + encoder_compute_budget) if num_new_tokens == 0: # The request cannot be scheduled. break @@ -495,7 +496,7 @@ class Scheduler(SchedulerInterface): # Allocate the encoder cache. for i in encoder_inputs_to_schedule: self.encoder_cache_manager.allocate(request, i) - encoder_budget = new_encoder_budget + encoder_compute_budget = new_encoder_compute_budget # Put back any skipped requests at the head of the waiting queue if skipped_waiting_requests: @@ -658,7 +659,7 @@ class Scheduler(SchedulerInterface): request: Request, num_computed_tokens: int, num_new_tokens: int, - encoder_budget: int, + encoder_compute_budget: int, ) -> tuple[list[int], int, int]: """ Determine which encoder inputs need to be scheduled in the current step, @@ -680,11 +681,17 @@ class Scheduler(SchedulerInterface): blocks and externally cached blocks (via KVConnector). 
""" if num_new_tokens == 0 or not request.has_encoder_inputs: - return [], num_new_tokens, encoder_budget + return [], num_new_tokens, encoder_compute_budget encoder_inputs_to_schedule: list[int] = [] mm_positions = request.mm_positions assert mm_positions is not None assert len(mm_positions) > 0 + + # NOTE: since scheduler operates on the request level (possibly with + # multiple encoder inputs per request), we need to create temporary + # trackers for accounting at the encoder input level. + mm_hashes_to_schedule = set() + num_tokens_to_schedule = 0 for i, pos_info in enumerate(mm_positions): start_pos = pos_info.offset num_encoder_tokens = pos_info.length @@ -695,13 +702,20 @@ class Scheduler(SchedulerInterface): if start_pos >= num_computed_tokens + num_new_tokens: # The encoder input is not needed in this step. break + if start_pos + num_encoder_tokens <= num_computed_tokens: # The encoder input is already computed and stored # in the decoder's KV cache. continue + # The same encoder input has already been scheduled in the current + # step. + if request.mm_hashes[i] in mm_hashes_to_schedule: + continue + if self.encoder_cache_manager.check_and_update_cache(request, i): - # The encoder input is already computed and cached. + # The encoder input is already computed and cached from a + # previous step. continue # If no encoder input chunking is allowed, we do not want to @@ -714,8 +728,9 @@ class Scheduler(SchedulerInterface): num_new_tokens = start_pos - num_computed_tokens break - if not self.encoder_cache_manager.try_allocate( - request, i, encoder_budget): + if not self.encoder_cache_manager.can_allocate( + request, i, encoder_compute_budget, + num_tokens_to_schedule): # The encoder cache is full or the encoder budget is exhausted. 
# NOTE(woosuk): We assume that the encoder input tokens should # be processed altogether, as the encoder usually uses @@ -732,9 +747,16 @@ class Scheduler(SchedulerInterface): num_new_tokens = 0 break - encoder_budget -= num_encoder_tokens + num_tokens_to_schedule += num_encoder_tokens + encoder_compute_budget -= num_encoder_tokens + mm_hashes_to_schedule.add(request.mm_hashes[i]) encoder_inputs_to_schedule.append(i) - return encoder_inputs_to_schedule, num_new_tokens, encoder_budget + + return ( + encoder_inputs_to_schedule, + num_new_tokens, + encoder_compute_budget, + ) def get_grammar_bitmask( self, From 50fede6634a997f4e971ecb4eb4cce337340e394 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Tue, 26 Aug 2025 18:00:18 +0800 Subject: [PATCH 023/112] [V1] Enable V1 for compute capability < 8.0 + FP32 (#23614) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/engine/arg_utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3ab1115f14462..f24c50ad73261 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1433,15 +1433,15 @@ class EngineArgs: recommend_to_remove=True) return False - # Need at least Ampere for now (FA support required). - # Skip this check if we are running on a non-GPU platform, - # or if the device capability is not available - # (e.g. in a Ray actor without GPUs). 
+ # Triton v3.3 has f16 conversion regression issue on Turing and Volta, + # which broke fp16 inference + # see: https://github.com/triton-lang/triton/issues/6698 if (current_platform.is_cuda() - and current_platform.get_device_capability() - and current_platform.get_device_capability().major < 8): - _raise_or_fallback(feature_name="Compute Capability < 8.0", - recommend_to_remove=False) + and not current_platform.has_device_capability(80) + and model_config.dtype == torch.float16): + _raise_or_fallback( + feature_name="Compute Capability < 8.0 with FP16", + recommend_to_remove=False) return False if self.kv_cache_dtype != "auto": From b00e69f8ca55f4a82847d39466f57ceb748324c1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 26 Aug 2025 11:27:20 +0100 Subject: [PATCH 024/112] Fix nits from #20059 (#23548) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/compilation.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index e2785e7602e45..56aa00a30d3ae 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -225,7 +225,8 @@ class CompilationConfig: # CudaGraph compilation cudagraph_mode: Optional[CUDAGraphMode] = None """ - The mode of the cudagraph. + The mode of the cudagraph: + - NONE, no cudagraph capture. - PIECEWISE. (v1 default) - FULL. 
@@ -384,13 +385,10 @@ class CompilationConfig: if pass_config_exclude: exclude["pass_config"] = pass_config_exclude - # The cast to string is necessary because Pydantic is mocked in docs - # builds and sphinx-argparse doesn't know the return type of decode() - return str( - TypeAdapter(CompilationConfig).dump_json( - self, - exclude=exclude, # type: ignore[arg-type] - exclude_unset=True).decode()) + return TypeAdapter(CompilationConfig).dump_json( + self, + exclude=exclude, # type: ignore[arg-type] + exclude_unset=True).decode() __str__ = __repr__ From 6ace2f72b03fe41475d7d64e2bfd40b79c447f5b Mon Sep 17 00:00:00 2001 From: Huy Do <huydhn@gmail.com> Date: Tue, 26 Aug 2025 04:16:09 -0700 Subject: [PATCH 025/112] Fix writing benchmark results with tuple keys (#23633) Signed-off-by: Huy Do <huydhn@gmail.com> --- vllm/benchmarks/lib/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/lib/utils.py b/vllm/benchmarks/lib/utils.py index 5f95fdcc75829..0c27687dcf16d 100644 --- a/vllm/benchmarks/lib/utils.py +++ b/vllm/benchmarks/lib/utils.py @@ -54,7 +54,12 @@ class InfEncoder(json.JSONEncoder): def clear_inf(self, o: Any): if isinstance(o, dict): - return {k: self.clear_inf(v) for k, v in o.items()} + return { + str(k) + if not isinstance(k, (str, int, float, bool, type(None))) + else k: self.clear_inf(v) + for k, v in o.items() + } elif isinstance(o, list): return [self.clear_inf(v) for v in o] elif isinstance(o, float) and math.isinf(o): From d52358c1e07768266e3db92e847cd28af87ca4b9 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Tue, 26 Aug 2025 07:16:33 -0400 Subject: [PATCH 026/112] [Perf] Remove duplicated NVFP4 blockscales to save memory (#23379) Signed-off-by: mgoin <mgoin64@gmail.com> --- .../compressed_tensors_moe.py | 20 +++++------ .../schemes/compressed_tensors_w4a4_nvfp4.py | 11 +++--- .../layers/quantization/modelopt.py | 34 ++++++++----------- 3 files changed, 30 insertions(+), 35 
deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 7bc35cd81ac3f..1ee3478aa4f43 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -246,13 +246,13 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): return # swizzle weight scales - layer.w13_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale( + layer.w13_weight_scale = torch.nn.Parameter(swizzle_blockscale( layer.w13_weight_scale), - requires_grad=False) + requires_grad=False) - layer.w2_blockscale_swizzled = torch.nn.Parameter(swizzle_blockscale( + layer.w2_weight_scale = torch.nn.Parameter(swizzle_blockscale( layer.w2_weight_scale), - requires_grad=False) + requires_grad=False) # w13 w13_input_global_scale = layer.w13_input_global_scale.max( @@ -383,8 +383,8 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_blockscale_swizzled, - w2_scale=layer.w2_blockscale_swizzled, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, apply_router_weight_on_input=apply_router_weight_on_input, ) @@ -406,8 +406,8 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_blockscale_swizzled, - w2_scale=layer.w2_blockscale_swizzled, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, g1_alphas=layer.g1_alphas, g2_alphas=layer.g2_alphas, a1_gscale=layer.w13_input_scale_quant, @@ -427,8 +427,8 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): a=x, w1_fp4=layer.w13_weight, w2_fp4=layer.w2_weight, - 
w1_blockscale=layer.w13_blockscale_swizzled, - w2_blockscale=layer.w2_blockscale_swizzled, + w1_blockscale=layer.w13_weight_scale, + w2_blockscale=layer.w2_weight_scale, g1_alphas=layer.g1_alphas, g2_alphas=layer.g2_alphas, a1_gscale=layer.w13_input_scale_quant, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py index 49d76bbeaa3a1..dedd681f15ded 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py @@ -112,13 +112,12 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme): torch.uint8), epilogue_tile_m).reshape( weight_scale.shape).view(torch.float8_e4m3fn)) - layer.weight_scale_swizzled = Parameter(weight_scale, - requires_grad=False) + layer.weight_scale = Parameter(weight_scale, requires_grad=False) layer.weight_packed = Parameter(weight, requires_grad=False) else: swizzled_weight_scale = swizzle_blockscale(layer.weight_scale) - layer.weight_scale_swizzled = Parameter(swizzled_weight_scale, - requires_grad=False) + layer.weight_scale = Parameter(swizzled_weight_scale, + requires_grad=False) layer.weight_packed = Parameter(layer.weight_packed.data, requires_grad=False) @@ -136,7 +135,7 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme): x=x, input_global_scale=layer.input_global_scale, weight=layer.weight_packed, - weight_scale_swizzled=layer.weight_scale_swizzled, + weight_scale_swizzled=layer.weight_scale, weight_global_scale=layer.weight_global_scale) if bias is not None: out = out + bias @@ -149,7 +148,7 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme): x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale) mm_args = (x_fp4, layer.weight_packed, x_blockscale, - layer.weight_scale_swizzled, layer.alpha, 
output_dtype) + layer.weight_scale, layer.alpha, output_dtype) if self.backend == "flashinfer-trtllm": out = flashinfer_scaled_fp4_mm(*mm_args, backend="trtllm") elif self.backend == "flashinfer-cutlass": diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 046234057f04a..72864853f7e0c 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -907,20 +907,18 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): torch.uint8), epilogue_tile_m).reshape( weight_scale.shape).view(torch.float8_e4m3fn)) - layer.weight_scale_swizzled = Parameter(weight_scale, - requires_grad=False) + layer.weight_scale = Parameter(weight_scale, requires_grad=False) layer.weight = Parameter(weight, requires_grad=False) else: swizzled_weight_scale = swizzle_blockscale(layer.weight_scale) - layer.weight_scale_swizzled = Parameter(swizzled_weight_scale, - requires_grad=False) + layer.weight_scale = Parameter(swizzled_weight_scale, + requires_grad=False) layer.weight = Parameter(layer.weight.data, requires_grad=False) if self.backend == "marlin": prepare_fp4_layer_for_marlin(layer) del layer.alpha del layer.input_scale - del layer.weight_scale_swizzled def apply( self, @@ -951,14 +949,14 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): assert (x_fp4.dtype == torch.uint8) assert (layer.weight.dtype == torch.uint8) assert (x_blockscale.dtype == torch.float8_e4m3fn) - assert (layer.weight_scale_swizzled.dtype == torch.float8_e4m3fn) + assert (layer.weight_scale.dtype == torch.float8_e4m3fn) assert (layer.alpha.dtype == torch.float32) mm_args = ( x_fp4, layer.weight, x_blockscale, - layer.weight_scale_swizzled, + layer.weight_scale, layer.alpha, output_dtype, ) @@ -1320,16 +1318,16 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): "Weight Blockscale must be represented as FP8-E4M3") w13_blockscale_swizzled = swizzle_blockscale( layer.w13_weight_scale) - 
layer.w13_blockscale_swizzled = Parameter(w13_blockscale_swizzled, - requires_grad=False) + layer.w13_weight_scale = Parameter(w13_blockscale_swizzled, + requires_grad=False) assert (layer.w2_weight_scale.shape[2] % 16 == 0), ( "Expected weight_scale.dim(1) to be divisible by 16") assert (layer.w2_weight_scale.dtype == torch.float8_e4m3fn), ( "Weight Blockscale must be represented as FP8-E4M3") w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale) - layer.w2_blockscale_swizzled = Parameter(w2_blockscale_swizzled, - requires_grad=False) + layer.w2_weight_scale = Parameter(w2_blockscale_swizzled, + requires_grad=False) layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False) @@ -1339,8 +1337,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): del layer.g2_alphas del layer.w13_input_scale_quant del layer.w2_input_scale_quant - del layer.w13_blockscale_swizzled - del layer.w2_blockscale_swizzled def apply( self, @@ -1474,8 +1470,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_blockscale_swizzled, - w2_scale=layer.w2_blockscale_swizzled, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, apply_router_weight_on_input=apply_router_weight_on_input, ) elif (self.allow_flashinfer @@ -1489,8 +1485,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): w2=layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - w1_scale=layer.w13_blockscale_swizzled, - w2_scale=layer.w2_blockscale_swizzled, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, g1_alphas=layer.g1_alphas, g2_alphas=layer.g2_alphas, a1_gscale=layer.w13_input_scale_quant, @@ -1510,8 +1506,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): a=x, w1_fp4=layer.w13_weight, w2_fp4=layer.w2_weight, - w1_blockscale=layer.w13_blockscale_swizzled, - w2_blockscale=layer.w2_blockscale_swizzled, + w1_blockscale=layer.w13_weight_scale, + 
w2_blockscale=layer.w2_weight_scale, g1_alphas=layer.g1_alphas, g2_alphas=layer.g2_alphas, a1_gscale=layer.w13_input_scale_quant, From fdeb3dac132c9ef92d981dd811529e6496781b07 Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Tue, 26 Aug 2025 20:09:47 +0800 Subject: [PATCH 027/112] [Model] fix DeepSeek e_score_correction_bias dtype to fp32 (#23640) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- vllm/model_executor/models/deepseek_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index d56224b4b7b30..7657e7cb003d6 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -126,7 +126,7 @@ class DeepseekV2MoE(nn.Module): prefix=f"{prefix}.gate") if config.topk_method == "noaux_tc": self.gate.e_score_correction_bias = nn.Parameter( - torch.empty(config.n_routed_experts)) + torch.empty(config.n_routed_experts, dtype=torch.float32)) else: self.gate.e_score_correction_bias = None From 384dd1b0a899c6761010b42aefe1159c8062f0a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C3=BA=C5=A1=20N=C3=A1me=C5=A1n=C3=BD?= <matus@namesny.com> Date: Tue, 26 Aug 2025 14:13:15 +0200 Subject: [PATCH 028/112] [Bugfix] Add missing enable_log_outputs parameter to init_app_state function (#23634) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Matúš Námešný <matus.namesny@ameria.com> --- vllm/entrypoints/openai/api_server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 14ba8aa641837..db02767fdfd71 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1748,6 +1748,7 @@ async def init_app_state( reasoning_parser=args.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, 
enable_force_include_usage=args.enable_force_include_usage, + enable_log_outputs=args.enable_log_outputs, ) if "generate" in supported_tasks else None state.openai_serving_chat = OpenAIServingChat( engine_client, @@ -1765,6 +1766,7 @@ async def init_app_state( reasoning_parser=args.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, + enable_log_outputs=args.enable_log_outputs, ) if "generate" in supported_tasks else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, From ebd5a77bb5a6b7643f047f61294da0ce92baf3f6 Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes <gcalmettes@scaleway.com> Date: Tue, 26 Aug 2025 14:26:26 +0200 Subject: [PATCH 029/112] feat: add usage to TranscriptionResponse (text and json response_format) (#23576) Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com> --- .../openai/test_transcription_validation.py | 14 ++++++++++---- vllm/entrypoints/openai/protocol.py | 6 ++++++ vllm/entrypoints/openai/speech_to_text.py | 17 ++++++++++++++++- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 93239f41a4aeb..6009d9aeec935 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -69,8 +69,11 @@ async def test_basic_audio(mary_had_lamb, model_name): language="en", response_format="text", temperature=0.0) - out = json.loads(transcription)['text'] - assert "Mary had a little lamb," in out + out = json.loads(transcription) + out_text = out['text'] + out_usage = out['usage'] + assert "Mary had a little lamb," in out_text + assert out_usage["seconds"] == 16, out_usage["seconds"] @pytest.mark.asyncio @@ -116,9 +119,12 @@ async def test_long_audio_request(mary_had_lamb, client): language="en", response_format="text", 
temperature=0.0) - out = json.loads(transcription)['text'] - counts = out.count("Mary had a little lamb") + out = json.loads(transcription) + out_text = out['text'] + out_usage = out['usage'] + counts = out_text.count("Mary had a little lamb") assert counts == 10, counts + assert out_usage["seconds"] == 161, out_usage["seconds"] @pytest.mark.asyncio diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index a3d7b78cf4552..5cb41bd93d4bc 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -2232,9 +2232,15 @@ class TranscriptionRequest(OpenAIBaseModel): # Transcription response objects +class TranscriptionUsageAudio(OpenAIBaseModel): + type: Literal["duration"] = "duration" + seconds: int + + class TranscriptionResponse(OpenAIBaseModel): text: str """The transcribed text.""" + usage: TranscriptionUsageAudio class TranscriptionWord(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index 01140a4bfea7e..de2619a78f8e0 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -200,7 +200,22 @@ class OpenAISpeechToText(OpenAIServing): for result_generator in list_result_generator: async for op in result_generator: text += op.outputs[0].text - return cast(T, response_class(text=text)) + + if self.task_type == "transcribe": + # add usage in TranscriptionResponse. 
+ usage = { + "type": "duration", + # rounded up as per openAI specs + "seconds": int(math.ceil(duration_s)), + } + final_response = cast(T, response_class(text=text, + usage=usage)) + else: + # no usage in response for translation task + final_response = cast( + T, response_class(text=text)) # type: ignore[call-arg] + + return final_response except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: From 2b4fc9bd9b8321265ff54065ea47bd9e327c6b6f Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Tue, 26 Aug 2025 05:41:52 -0700 Subject: [PATCH 030/112] Support FlashAttention Backend for Hybrid SSM Models (#23299) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- .../models/language/generation/test_hybrid.py | 3 -- vllm/v1/worker/gpu_model_runner.py | 41 ++++++++----------- 2 files changed, 17 insertions(+), 27 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 2055c44c83cda..7e7cc893ec8aa 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -110,9 +110,6 @@ def test_models( if model in V1_SUPPORTED_MODELS: with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - if model in HYBRID_MODELS: - # required due to reorder_batch behaviour - m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, enable_prefix_caching=False) as vllm_model: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4f6cf9a350706..14f2305dadc54 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3023,40 +3023,33 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): raise NotImplementedError if has_attn and has_mamba: - self._verify_hybrid_attention_mamba_layout(kv_cache_config, - kv_cache_raw_tensors) + 
self._update_hybrid_attention_mamba_layout(kv_caches) return kv_caches - def _verify_hybrid_attention_mamba_layout( - self, kv_cache_config: KVCacheConfig, - kv_cache_raw_tensors: dict[str, torch.Tensor]) -> None: + def _update_hybrid_attention_mamba_layout( + self, kv_caches: dict[str, torch.Tensor]) -> None: """ - Verify that the KV cache memory layout is compatible for - models with both attention and mamba KV cache groups. + Update the layout of attention layers from (2, num_blocks, ...) to + (num_blocks, 2, ...). Args: - kv_cache_config: The KV cache config - kv_cache_raw_tensors: The KV cache buffer of each layer. + kv_caches: The KV cache buffer of each layer. """ for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator(): for layer_name in group.layer_names: - raw_tensor = kv_cache_raw_tensors[layer_name] - num_blocks = (raw_tensor.numel() // - kv_cache_spec.page_size_bytes) - if isinstance(kv_cache_spec, AttentionSpec): - - kv_cache_shape = group.backend.get_kv_cache_shape( - num_blocks, kv_cache_spec.block_size, - kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) - if kv_cache_shape[0] != num_blocks or kv_cache_shape[ - 1] != 2: - raise ValueError( - "Hybrid models in V1 require an attention " - "backend with kv_cache_shape=" - "(num_blocks, 2, ...). Please try setting " - "VLLM_ATTENTION_BACKEND=FLASHINFER") + kv_cache = kv_caches[layer_name] + if (isinstance(kv_cache_spec, AttentionSpec) + and kv_cache.shape[0] == 2): + assert kv_cache.shape[1] != 2, \ + "Fail to determine whether the layout is " \ + "(2, num_blocks, ...) or (num_blocks, 2, ...) 
for " \ + f"a tensor of shape {kv_cache.shape}" + hidden_size = kv_cache.shape[2:].numel() + kv_cache.as_strided_(size=kv_cache.shape, + stride=(hidden_size, 2 * hidden_size, + *kv_cache.stride()[2:])) def initialize_kv_cache_tensors( self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]: From 164b2273c87ad72b2d3b1f2762367de42d6e946b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 26 Aug 2025 14:00:18 +0100 Subject: [PATCH 031/112] [Docs] Fix broken links to `docs/api/summary.md` (#23637) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/examples/README.md | 6 +++--- docs/models/generative_models.md | 2 +- docs/models/pooling_models.md | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/examples/README.md b/docs/examples/README.md index 34e4dfd408a20..3cf93027f4209 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -2,6 +2,6 @@ vLLM's examples are split into three categories: -- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference/) -- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving/) -- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others/) +- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference) +- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving) +- For examples of using some of vLLM's advanced features (e.g. 
LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others) diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index a64ecd31ebaef..d02522a6657de 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -19,7 +19,7 @@ Run a model in generation mode via the option `--runner generate`. ## Offline Inference The [LLM][vllm.LLM] class provides various methods for offline inference. -See [configuration](../api/summary.md#configuration) for a list of options when initializing the model. +See [configuration](../api/README.md#configuration) for a list of options when initializing the model. ### `LLM.generate` diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 753d8bd0b8339..fbb5f6f6dd171 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -81,7 +81,7 @@ which takes priority over both the model's and Sentence Transformers's defaults. ## Offline Inference The [LLM][vllm.LLM] class provides various methods for offline inference. -See [configuration](../api/summary.md#configuration) for a list of options when initializing the model. +See [configuration](../api/README.md#configuration) for a list of options when initializing the model. 
### `LLM.embed` From b78bed1bc5debead116092f429eee51398691fc8 Mon Sep 17 00:00:00 2001 From: En Ouyang <en.ouyang93@outlook.com> Date: Tue, 26 Aug 2025 21:04:25 +0800 Subject: [PATCH 032/112] [Hardware][Mac] Fix the installation fail for Apple Silicon (CPU) (#23565) Signed-off-by: oye93 <en.ouyang93@outlook.com> Co-authored-by: Li, Jiang <jiang1.li@intel.com> --- cmake/cpu_extension.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index cc38cd41a5b24..52bfd82c7fcfe 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -1,6 +1,7 @@ include(FetchContent) set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_EXTENSIONS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) From f66673a39d9f364194c249f28098cad8a5584ccb Mon Sep 17 00:00:00 2001 From: nvjullin <jullin@nvidia.com> Date: Tue, 26 Aug 2025 21:54:04 +0800 Subject: [PATCH 033/112] [Kernel] Added flashinfer fp8 per-tensor gemms (#22895) Signed-off-by: Julien Lin <jullin@nvidia.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> --- .buildkite/test-pipeline.yaml | 1 + tests/compile/test_fusion.py | 15 ++-- tests/compile/test_sequence_parallelism.py | 3 +- tests/compile/test_silu_mul_quant_fusion.py | 13 ++-- .../quantization/test_flashinfer_scaled_mm.py | 73 +++++++++++++++++++ .../model_executor/layers/quantization/fp8.py | 5 +- .../layers/quantization/ptpc_fp8.py | 4 +- .../layers/quantization/utils/w8a8_utils.py | 59 +++++++++++---- vllm/utils/flashinfer.py | 61 ++++++++++++++++ 9 files changed, 198 insertions(+), 36 deletions(-) create mode 100644 tests/kernels/quantization/test_flashinfer_scaled_mm.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 1ccfa93c571ce..0d3b7a294d963 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -655,6 +655,7 @@ steps: - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - pytest -v -s 
tests/kernels/quantization/test_nvfp4_quant.py - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 5cfad935a0fb1..c4229f93464ac 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -15,7 +15,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, QuantKey, ScaleDesc) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - CUTLASS_FP8_SUPPORTED, Fp8LinearOp, maybe_create_device_identity) + Fp8LinearOp, maybe_create_device_identity) from vllm.platforms import current_platform from .backend import TestBackend @@ -26,9 +26,9 @@ FP8_DTYPE = current_platform.fp8_dtype() class TestModel(torch.nn.Module): def __init__(self, hidden_size: int, eps: float, static: bool, - cutlass_fp8_enabled: bool, *args, **kwargs): + force_fp8_e4m3fnuz: bool, *args, **kwargs): super().__init__(*args, **kwargs) - self.cutlass_fp8_enabled = cutlass_fp8_enabled + self.force_fp8_e4m3fnuz = force_fp8_e4m3fnuz self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)] self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN @@ -43,7 +43,7 @@ class TestModel(torch.nn.Module): for _ in range(2) ] self.fp8_linear = Fp8LinearOp( - cutlass_fp8_supported=cutlass_fp8_enabled, + force_fp8_e4m3fnuz=force_fp8_e4m3fnuz, act_quant_static=static, act_quant_group_shape=group_shape, ) @@ -81,12 +81,11 @@ class TestModel(torch.nn.Module): @pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049]) @pytest.mark.parametrize("eps", [1e-5, 1e-6]) 
@pytest.mark.parametrize("static", [True, False]) -@pytest.mark.parametrize("cutlass_fp8_enabled", - [True, False] if CUTLASS_FP8_SUPPORTED else [False]) +@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm") def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, - cutlass_fp8_enabled): + force_fp8_e4m3fnuz): torch.set_default_device("cuda") torch.set_default_dtype(dtype) torch.manual_seed(1) @@ -103,7 +102,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, fusion_pass = FusionPass.instance(vllm_config) backend = TestBackend(noop_pass, fusion_pass) - model = TestModel(hidden_size, eps, static, cutlass_fp8_enabled) + model = TestModel(hidden_size, eps, static, force_fp8_e4m3fnuz) # First dimension dynamic x = torch.rand(num_tokens, hidden_size) diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py index a6baa97fe6990..fb9f9dde22799 100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/test_sequence_parallelism.py @@ -104,8 +104,7 @@ class TestQuantModel(torch.nn.Module): # Initialize weights torch.nn.init.normal_(self.gate_proj, std=0.02) - self.fp8_linear = Fp8LinearOp(cutlass_fp8_supported=True, - use_per_token_if_dynamic=False) + self.fp8_linear = Fp8LinearOp(use_per_token_if_dynamic=False) self.scale = torch.rand(1, dtype=torch.float32) # Create a weight that is compatible with torch._scaled_mm, diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index 5351a3cf35ba5..0e1059e654479 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -12,7 +12,7 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape) from 
vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - CUTLASS_FP8_SUPPORTED, Fp8LinearOp) + Fp8LinearOp) from vllm.platforms import current_platform from .backend import TestBackend @@ -20,7 +20,7 @@ from .backend import TestBackend class TestModel(torch.nn.Module): - def __init__(self, hidden_size: int, cutlass_fp8_enabled: bool, *args, + def __init__(self, hidden_size: int, force_fp8_e4m3fnuz: bool, *args, **kwargs): super().__init__(*args, **kwargs) self.silu_and_mul = SiluAndMul() @@ -32,7 +32,7 @@ class TestModel(torch.nn.Module): hidden_size).to(dtype=current_platform.fp8_dtype()).t()) self.fp8_linear = Fp8LinearOp( - cutlass_fp8_supported=cutlass_fp8_enabled, + force_fp8_e4m3fnuz=force_fp8_e4m3fnuz, act_quant_static=True, act_quant_group_shape=GroupShape.PER_TENSOR, ) @@ -48,12 +48,11 @@ class TestModel(torch.nn.Module): @pytest.mark.parametrize("num_tokens", [256]) @pytest.mark.parametrize("hidden_size", [64]) -@pytest.mark.parametrize("cutlass_fp8_enabled", - [True, False] if CUTLASS_FP8_SUPPORTED else [False]) +@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm") def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, - cutlass_fp8_enabled): + force_fp8_e4m3fnuz): torch.set_default_device("cuda") torch.set_default_dtype(torch.float16) @@ -64,7 +63,7 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, fusion_pass = ActivationQuantFusionPass(config) backend = TestBackend(NoOpEliminationPass(config), fusion_pass) - model = TestModel(hidden_size, cutlass_fp8_enabled) + model = TestModel(hidden_size, force_fp8_e4m3fnuz) # First dimension dynamic x = torch.rand(num_tokens, hidden_size * 2) diff --git a/tests/kernels/quantization/test_flashinfer_scaled_mm.py b/tests/kernels/quantization/test_flashinfer_scaled_mm.py new file mode 100644 index 0000000000000..9f669c6df8bd5 --- /dev/null +++ 
b/tests/kernels/quantization/test_flashinfer_scaled_mm.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + +from vllm import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm + +if not current_platform.has_device_capability(100): + pytest.skip( + reason= + "Flashinfer FP8 gemms requires compute capability of 10.0 or above.", + allow_module_level=True, + ) + +DTYPES = [torch.float16, torch.bfloat16] +# m, n, k +SHAPES = [(128, 128, 64), (128, 128, 128), (256, 128, 64), (128, 256, 128)] +PAD_SHAPES = [(150, 128, 64), (128, 128, 96)] +SHAPES.extend(PAD_SHAPES) + +SEEDS = [42] +CUDA_DEVICES = ["cuda:0"] + + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("use_bias", [True, False]) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("autotune", [False, True]) +@torch.inference_mode() +def test_flashinfer_fp8_gemm( + dtype: torch.dtype, + shape: tuple[int, int, int], + use_bias: bool, + seed: int, + device: str, + autotune: bool, +) -> None: + current_platform.seed_everything(seed) + m, n, k = shape + a = torch.randn((m, k), dtype=dtype, device=device) + b = torch.randn((n, k), dtype=dtype, device=device) / k + + a_fp8, a_scale = ops.scaled_fp8_quant(a) + b_fp8, b_scale = ops.scaled_fp8_quant(b) + + expected_out = torch.mm( + a_scale * a_fp8.to(dtype=torch.float32), + b_scale * b_fp8.to(dtype=torch.float32).t(), + ).to(dtype=dtype) + + if use_bias: + bias = torch.randn((n, ), dtype=dtype, device=device) + expected_out = expected_out + bias + else: + bias = None + + import flashinfer + + with flashinfer.autotune(autotune): + out = flashinfer_scaled_fp8_mm( + a_fp8, + b_fp8.t(), + a_scale, + b_scale, + dtype, + bias=bias, + ) + + torch.testing.assert_close(out, 
expected_out, atol=1e-2, rtol=1e-2) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index a4de4d7094c30..d45d368b582df 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -223,8 +223,7 @@ class Fp8LinearMethod(LinearMethodBase): self.fp8_linear = Fp8LinearOp( act_quant_static=self.act_q_static, - act_quant_group_shape=self.act_q_group_shape, - cutlass_fp8_supported=cutlass_fp8_supported()) + act_quant_group_shape=self.act_q_group_shape) def create_weights( self, @@ -376,6 +375,8 @@ class Fp8LinearMethod(LinearMethodBase): # Update the layer with the new values. layer.weight = Parameter(qweight.t(), requires_grad=False) layer.weight_scale = Parameter(weight_scale, requires_grad=False) + # layer.input_scale is None indicates dynamic quant and scale is + # computed from input. layer.input_scale = None # If checkpoint is fp8, handle that there are N scales for N diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py index d11cba2caba88..466fd5fba7685 100644 --- a/vllm/model_executor/layers/quantization/ptpc_fp8.py +++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py @@ -97,8 +97,8 @@ class PTPCFp8LinearMethod(Fp8LinearMethod): self.quant_config.is_checkpoint_fp8_serialized = False self.fp8_linear = Fp8LinearOp( act_quant_static=False, - cutlass_fp8_supported=False, - act_quant_group_shape=GroupShape.PER_TOKEN) + act_quant_group_shape=GroupShape.PER_TOKEN, + force_fp8_e4m3fnuz=True) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.weight = torch.nn.Parameter(layer.weight.data, diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 36d16960ec57c..5333bbd310ff9 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ 
b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -14,6 +14,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape) from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op +from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm, has_flashinfer # Input scaling factors are no longer optional in _scaled_mm starting # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale @@ -157,6 +158,19 @@ def cutlass_w8a8_scaled_mm(*, qinput: torch.Tensor, weight: torch.Tensor, return output.view(*output_shape) +def flashinfer_w8a8_scaled_mm(*, qinput: torch.Tensor, weight: torch.Tensor, + out_dtype: torch.dtype, scale_a: torch.Tensor, + scale_b: torch.Tensor, bias: torch.Tensor, + output_shape: list, **kwargs) -> torch.Tensor: + + return flashinfer_scaled_fp8_mm(qinput, + weight, + out_dtype=out_dtype, + scale_a=scale_a, + scale_b=scale_b, + bias=bias) + + def rocm_per_tensor_w8a8_scaled_mm_impl( qinput: torch.Tensor, weight: torch.Tensor, out_dtype: torch.dtype, scale_a: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor, @@ -231,8 +245,8 @@ def torch_per_token_w8a8_scaled_mm(*, qinput: torch.Tensor, out_dtype: torch.dtype, scale_a: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor, - input_2d: torch.Tensor, - output_shape: list) -> torch.Tensor: + input_2d: torch.Tensor, output_shape: list, + **kwargs) -> torch.Tensor: # Note: Callers of this function should check USE_ROWWISE_TORCH_SCALED_MM # when using it. # For now it has only been validated on ROCm platform. 
@@ -303,16 +317,22 @@ def torch_channelwise_w8a8_scaled_mm(*, qinput: torch.Tensor, def dispatch_w8a8_scaled_mm( - cutlass_fp8_supported: bool, per_tensor_weights: bool, + preferred_backend: str, per_tensor_weights: bool, per_tensor_activations: bool) -> Callable[..., torch.Tensor]: - # cutlass_scaled_mm supports per tensor/channel W and per tensor/token A - if cutlass_fp8_supported: - return cutlass_w8a8_scaled_mm if per_tensor_weights and per_tensor_activations: - if current_platform.is_rocm(): + if preferred_backend == "rocm": return rocm_per_tensor_w8a8_scaled_mm + if preferred_backend == "flashinfer": + return flashinfer_w8a8_scaled_mm + if preferred_backend == "cutlass": + return cutlass_w8a8_scaled_mm return torch_per_tensor_w8a8_scaled_mm + + # cutlass_scaled_mm supports per tensor/channel W and per tensor/token A + if preferred_backend == "cutlass" or preferred_backend == "flashinfer": + return cutlass_w8a8_scaled_mm + # If torch.scaled_mm supports per-channel (weights) per-token (inputs) if not per_tensor_weights and not per_tensor_activations \ and USE_ROWWISE_TORCH_SCALED_MM: @@ -334,10 +354,20 @@ class Fp8LinearOp: def __init__(self, act_quant_static: bool, - cutlass_fp8_supported: bool = cutlass_fp8_supported(), act_quant_group_shape: GroupShape = GroupShape.PER_TENSOR, - pad_output: Optional[bool] = None): - self.cutlass_fp8_supported = cutlass_fp8_supported + pad_output: Optional[bool] = None, + force_fp8_e4m3fnuz: bool = False): + if current_platform.is_rocm(): + self.preferred_backend = "rocm" + elif current_platform.is_cuda( + ) and not force_fp8_e4m3fnuz and cutlass_fp8_supported(): + if has_flashinfer() and current_platform.has_device_capability( + 100): + self.preferred_backend = "flashinfer" + else: + self.preferred_backend = "cutlass" + else: + self.preferred_backend = "torch" # Note: we pad the input because torch._scaled_mm is more performant # for matrices with batch dimension > 16. 
@@ -347,8 +377,7 @@ class Fp8LinearOp: if pad_output is None: config = get_current_vllm_config().compilation_config pad_output = config.level < CompilationLevel.PIECEWISE and \ - not cutlass_fp8_supported and \ - not current_platform.is_rocm() + self.preferred_backend == "torch" self.output_padding = 17 if pad_output else None self.act_quant_static = act_quant_static @@ -393,9 +422,9 @@ class Fp8LinearOp: per_tensor_activations = (x_scale.numel() == 1) # TODO(luka) do this dispatch during init (after ScaledMM refactor) - w8a8_scaled_mm_func = dispatch_w8a8_scaled_mm( - self.cutlass_fp8_supported, per_tensor_weights, - per_tensor_activations) + w8a8_scaled_mm_func = dispatch_w8a8_scaled_mm(self.preferred_backend, + per_tensor_weights, + per_tensor_activations) return w8a8_scaled_mm_func(qinput=qinput, weight=weight, diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 5dd239c50f637..fab134733d4fd 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -265,6 +265,37 @@ if has_flashinfer(): dtype=dtype, device=A.device) + @torch.library.custom_op( + "vllm::bmm_fp8", + mutates_args=[], + device_types="cuda", + ) + def bmm_fp8( + A: torch.Tensor, + B: torch.Tensor, + A_scale: torch.Tensor, + B_scale: torch.Tensor, + dtype: torch.dtype, + backend: str, + ) -> torch.Tensor: + from flashinfer import bmm_fp8 as bmm_fp8_ + return bmm_fp8_(A, B, A_scale, B_scale, dtype, None, backend) + + @torch.library.register_fake("vllm::bmm_fp8", ) + def bmm_fp8_fake( + A: torch.Tensor, + B: torch.Tensor, + A_scale: torch.Tensor, + B_scale: torch.Tensor, + dtype: torch.dtype, + backend: str, + ) -> torch.Tensor: + return torch.empty(A.shape[0], + A.shape[1], + B.shape[2], + dtype=dtype, + device=A.device) + def flashinfer_scaled_fp4_mm(a: torch.Tensor, b: torch.Tensor, block_scale_a: torch.Tensor, @@ -293,6 +324,35 @@ def flashinfer_scaled_fp4_mm(a: torch.Tensor, b: torch.Tensor, ) +def flashinfer_scaled_fp8_mm( + a: torch.Tensor, + b: torch.Tensor, + 
scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: torch.dtype, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + assert a.ndim == 2 and b.ndim == 2 + assert a.shape[1] == b.shape[0] + assert scale_a.numel() == 1 and scale_b.numel() == 1 + assert a.dtype == torch.float8_e4m3fn and b.dtype == torch.float8_e4m3fn + assert a.device.type == "cuda" and b.device.type == "cuda" + assert scale_a.dtype == torch.float32 and scale_b.dtype == torch.float32 + assert scale_a.device.type == "cuda" and scale_b.device.type == "cuda" + + output = bmm_fp8( + a.unsqueeze(0), + b.unsqueeze(0), + scale_a, + scale_b, + out_dtype, + "auto", + ).view(a.shape[0], b.shape[1]) + + if bias is not None: + output = output + bias + return output + + __all__ = [ "has_flashinfer", "flashinfer_trtllm_fp8_block_scale_moe", @@ -307,4 +367,5 @@ __all__ = [ "supports_trtllm_attention", "use_trtllm_attention", "flashinfer_scaled_fp4_mm", + "flashinfer_scaled_fp8_mm", ] From 7c04779afa7d0811dba3e1ec98c0ac1bc56570be Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Tue, 26 Aug 2025 16:05:29 +0200 Subject: [PATCH 034/112] [Doc]: fix various spelling issues in multiple files (#23636) Signed-off-by: Didier Durand <durand.didier@gmail.com> --- .buildkite/nightly-benchmarks/README.md | 2 +- benchmarks/README.md | 2 +- docs/configuration/optimization.md | 4 ++-- docs/configuration/tpu.md | 2 +- docs/design/fused_moe_modular_kernel.md | 6 +++--- vllm/distributed/kv_transfer/README.md | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index b39f9899a8f28..e6f5c8b60f459 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -141,7 +141,7 @@ When run, benchmark script generates results under `benchmark/results` folder, a `compare-json-results.py` compares two `benchmark_results.json` files and 
provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT. If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead. -Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output lenght, max concurrency and qps. +Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps. `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` | | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio | diff --git a/benchmarks/README.md b/benchmarks/README.md index a2dd5bb58325c..38072152b653b 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -749,7 +749,7 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct \ Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `random-output-len`. -Ex.1: Fixed number of items and a single image resolutionm, enforcing generation of approx 40 tokens: +Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens: ```bash vllm bench serve \ diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 6c7c31f503c15..bb47e1b90f086 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -168,7 +168,7 @@ llm = LLM( Batch-level DP is not to be confused with API request-level DP (which is instead controlled by `data_parallel_size`). -The availablilty of batch-level DP is based on model implementation. +The availability of batch-level DP is based on model implementation. 
Currently, the following models support `mm_encoder_tp_mode="data"`: - Llama4 (<gh-pr:18368>) @@ -205,7 +205,7 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2 !!! note [Multi-modal processor cache](#processor-cache) is disabled when API server scale-out is enabled - because it requires a one-to-one correspondance between API and engine core processes. + because it requires a one-to-one correspondence between API and engine core processes. ## Multi-Modal Caching diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md index a93435ed71b50..ac2b6baffd14e 100644 --- a/docs/configuration/tpu.md +++ b/docs/configuration/tpu.md @@ -70,7 +70,7 @@ For example, max_model_len=512, padding_gap=64, the buckets will be [16, 32, 64, The fewer tokens we pad, the less unnecessary computation TPU does, the better performance we can get. For example, if num_tokens=300, with exponential padding, we pad to 512, with the bucket_padding above, we pad to 320. -However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compilaed graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding. +However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compiled graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding. 
#### Quantization diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md index 3c4c7d2102170..202e9c1caf113 100644 --- a/docs/design/fused_moe_modular_kernel.md +++ b/docs/design/fused_moe_modular_kernel.md @@ -133,7 +133,7 @@ class FusedMoEModularKernel: Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & Combine implementation / kernel. For example, * PplxPrepareAndFinalize type is backed by Pplx All2All kernels, -* DeepEPHTPrepareAndFinalize type is backed by DeepEP High-Throughtput All2All kernels, and +* DeepEPHTPrepareAndFinalize type is backed by DeepEP High-Throughput All2All kernels, and * DeepEPLLPrepareAndFinalize type is backed by DeepEP Low-Latency All2All kernels. #### Step 1: Add an All2All manager @@ -183,7 +183,7 @@ implementations that input `FusedMoEActivationFormat.Standard` support chunking #### maybe_make_prepare_finalize -The `maybe_make_prepare_finalize` method is responsbile for constructing an instance of `FusedMoEPrepareAndFinalize` when appropriate based on the current all2all backend, e.g. when EP + DP is enabled. The base class method currently constructs all the `FusedMoEPrepareAndFinalize` objects for the EP+DP case. Derived classes can override this method to construct prepare/finalize objects for different scenarios, e.g. `ModelOptNvFp4FusedMoE` can construct a `FlashInferCutlassMoEPrepareAndFinalize` for the EP+TP case. +The `maybe_make_prepare_finalize` method is responsible for constructing an instance of `FusedMoEPrepareAndFinalize` when appropriate based on the current all2all backend, e.g. when EP + DP is enabled. The base class method currently constructs all the `FusedMoEPrepareAndFinalize` objects for the EP+DP case. Derived classes can override this method to construct prepare/finalize objects for different scenarios, e.g. `ModelOptNvFp4FusedMoE` can construct a `FlashInferCutlassMoEPrepareAndFinalize` for the EP+TP case. 
Please refer to the implementations in, * `ModelOptNvFp4FusedMoE` @@ -198,7 +198,7 @@ Please refer to the implementations in, * `CompressedTensorsW8A8Fp8MoECutlassMethod` * `Fp8MoEMethod` * `ModelOptNvFp4FusedMoE` -dervied classes. +derived classes. #### init_prepare_finalize diff --git a/vllm/distributed/kv_transfer/README.md b/vllm/distributed/kv_transfer/README.md index 349d3dfbd84fc..39377aabcce3a 100644 --- a/vllm/distributed/kv_transfer/README.md +++ b/vllm/distributed/kv_transfer/README.md @@ -2,7 +2,7 @@ # Distributed KV cache transfer This folder implements distributed KV cache transfer across vLLM instances. -Currently the main usecase is for disaggregated prefilling. +Currently the main use case is for disaggregated prefilling. ## Abstractions @@ -14,7 +14,7 @@ The KV cache transfer contains three layer of abstractions: Why we need KV lookup buffer: FIFO pipe itself is not enough as prefill vLLM worker may process requests in a different order compared to decode vLLM worker. Say the QPS is really high, prefill worker may handle requests in order A -> B -> C, but the decode worker may process request C first. This is not the case that can be naturally handled by FIFO pipe, so we provide KV lookup buffer to help translate a FIFO pipe to a lookup buffer. -NOTE: KV pipe layer is bypassible: you can skip this layer if your distributed +NOTE: KV pipe layer is bypassable: you can skip this layer if your distributed communication service already supports key-value-based lookup (like redis or RDMA database). 
From f58675bfb36b67cdbca4d2356a2f580e7a706ec3 Mon Sep 17 00:00:00 2001 From: TianyuLi0 <116711075+TianyuLi0@users.noreply.github.com> Date: Tue, 26 Aug 2025 22:09:17 +0800 Subject: [PATCH 035/112] [CPU] add cpu fused moe pytorch native implementation (#23146) Signed-off-by: Tianyu Li <tianyu.li@arm.com> Co-authored-by: Li, Jiang <jiang1.li@intel.com> --- .../layers/fused_moe/cpu_fused_moe.py | 286 +++++++++++------- vllm/model_executor/layers/fused_moe/layer.py | 4 +- 2 files changed, 180 insertions(+), 110 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py index e67ff66882102..769a04b7de89d 100644 --- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py @@ -3,10 +3,110 @@ from typing import Callable, Optional import torch +from torch.nn import functional as F from vllm import envs +def silu_and_mul(x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + return F.silu(x[..., :d]) * x[..., d:] + + +def grouped_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + num_expert_group: int = 0, + topk_group: int = 0, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None +) -> tuple[torch.Tensor, torch.Tensor]: + assert hidden_states.shape[0] == gating_output.shape[0], ( + "Number of tokens mismatch") + + gating_output = gating_output.float() + if scoring_func == "softmax": + scores = torch.softmax(gating_output, dim=-1) + elif scoring_func == "sigmoid": + scores = gating_output.sigmoid() + else: + raise ValueError(f"Unsupported scoring function: {scoring_func}") + + num_token = scores.shape[0] + if e_score_correction_bias is not None: + original_scores = scores + scores = scores + e_score_correction_bias.unsqueeze(0) + group_scores = (scores.view(num_token, num_expert_group, + -1).topk(2, dim=-1)[0].sum(dim=-1)) + else: + group_scores = 
scores.view(num_token, num_expert_group, + -1).max(dim=-1).values # [n, n_group] + group_idx = torch.topk(group_scores, k=topk_group, dim=-1, + sorted=False)[1] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, n_group] + group_mask.scatter_(1, group_idx, 1) # [n, n_group] + score_mask = group_mask.unsqueeze(-1).expand( + num_token, num_expert_group, + scores.shape[-1] // num_expert_group).reshape(num_token, -1) # [n, e] + tmp_scores = scores.masked_fill(~score_mask.bool(), + float("-inf")) # [n, e] + + if e_score_correction_bias is not None: + topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)[1] + topk_weights = original_scores.gather(1, topk_ids) + else: + topk_weights, topk_ids = torch.topk(tmp_scores, + k=topk, + dim=-1, + sorted=False) + + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + return topk_weights, topk_ids.to(torch.int32) + + +def select_experts( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + use_grouped_topk: bool, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, +) -> tuple[torch.Tensor, torch.Tensor]: + if use_grouped_topk: + assert topk_group is not None + assert num_expert_group is not None + return grouped_topk(hidden_states=hidden_states, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize, + num_expert_group=num_expert_group, + topk_group=topk_group, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) + elif custom_routing_function is None: + assert scoring_func == "softmax" + topk_weights = torch.nn.functional.softmax(router_logits, + dim=1, + dtype=torch.float32) + topk_weights, topk_ids = torch.topk(topk_weights, top_k, dim=-1) + if renormalize: + topk_weights /= topk_weights.sum(dim=-1, 
keepdim=True) + return topk_weights, topk_ids.to(torch.int32) + else: + return custom_routing_function(hidden_states=hidden_states, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize) + + class IPEXFusedMOE: def __init__(self, layer: torch.nn.Module) -> None: @@ -56,113 +156,6 @@ class SGLFusedMOE: def __init__(self, layer: torch.nn.Module) -> None: pass - @staticmethod - def _grouped_topk( - hidden_states: torch.Tensor, - gating_output: torch.Tensor, - topk: int, - renormalize: bool, - num_expert_group: int = 0, - topk_group: int = 0, - scoring_func: str = "softmax", - e_score_correction_bias: Optional[torch.Tensor] = None - ) -> tuple[torch.Tensor, torch.Tensor]: - assert hidden_states.shape[0] == gating_output.shape[0], ( - "Number of tokens mismatch") - - gating_output = gating_output.float() - if scoring_func == "softmax": - scores = torch.softmax(gating_output, dim=-1) - elif scoring_func == "sigmoid": - scores = gating_output.sigmoid() - else: - raise ValueError(f"Unsupported scoring function: {scoring_func}") - - num_token = scores.shape[0] - if e_score_correction_bias is not None: - # Store original scores before applying correction bias. 
We use - # biased scores for expert selection but original scores for - # routing weights - original_scores = scores - scores = scores + e_score_correction_bias.unsqueeze(0) - group_scores = (scores.view(num_token, num_expert_group, - -1).topk(2, dim=-1)[0].sum(dim=-1)) - else: - group_scores = scores.view(num_token, num_expert_group, - -1).max(dim=-1).values # [n, n_group] - group_idx = torch.topk(group_scores, - k=topk_group, - dim=-1, - sorted=False)[1] # [n, top_k_group] - group_mask = torch.zeros_like(group_scores) # [n, n_group] - group_mask.scatter_(1, group_idx, 1) # [n, n_group] - score_mask = group_mask.unsqueeze(-1).expand( - num_token, num_expert_group, - scores.shape[-1] // num_expert_group).reshape(num_token, - -1) # [n, e] - tmp_scores = scores.masked_fill(~score_mask.bool(), - float("-inf")) # [n, e] - - if e_score_correction_bias is not None: - topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)[1] - # Use original unbiased scores for the routing weights - topk_weights = original_scores.gather(1, topk_ids) - else: - topk_weights, topk_ids = torch.topk(tmp_scores, - k=topk, - dim=-1, - sorted=False) - - if renormalize: - topk_weights = topk_weights / topk_weights.sum(dim=-1, - keepdim=True) - - return topk_weights, topk_ids.to(torch.int32) - - @staticmethod - def _select_experts( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - top_k: int, - use_grouped_topk: bool, - renormalize: bool, - topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None, - custom_routing_function: Optional[Callable] = None, - scoring_func: str = "softmax", - e_score_correction_bias: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, torch.Tensor]: - # DeekSeekv2 uses grouped_top_k - if use_grouped_topk: - assert topk_group is not None - assert num_expert_group is not None - topk_weights, topk_ids = SGLFusedMOE._grouped_topk( - hidden_states=hidden_states, - gating_output=router_logits, - topk=top_k, - 
renormalize=renormalize, - num_expert_group=num_expert_group, - topk_group=topk_group, - scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) - elif custom_routing_function is None: - assert scoring_func == "softmax" - topk_weights = torch.nn.functional.softmax(router_logits, - dim=1, - dtype=torch.float32) - topk_weights, topk_ids = torch.topk(topk_weights, top_k, dim=-1) - if renormalize: - topk_weights /= topk_weights.sum(dim=-1, keepdim=True) - topk_ids = topk_ids.to(torch.int32) - else: - topk_weights, topk_ids = custom_routing_function( - hidden_states=hidden_states, - gating_output=router_logits, - topk=top_k, - renormalize=renormalize) - - return topk_weights, topk_ids - def __call__( self, layer: torch.nn.Module, @@ -183,7 +176,7 @@ class SGLFusedMOE: ) -> torch.Tensor: assert activation == "silu", f"{activation} is not supported." assert not apply_router_weight_on_input - topk_weights, topk_ids = SGLFusedMOE._select_experts( + topk_weights, topk_ids = select_experts( hidden_states=x, router_logits=router_logits, use_grouped_topk=use_grouped_topk, @@ -213,3 +206,80 @@ class SGLFusedMOE: True, ) return x + + +class CPUFusedMOE: + + def __init__(self, layer: torch.nn.Module) -> None: + pass + + def __call__( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + ) -> torch.Tensor: + assert activation == "silu", f"{activation} is not supported." 
+ assert not apply_router_weight_on_input + topk_weights, topk_ids = select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + ) + + # Ref code from https://github.com/sgl-project/sglang/blob/716e682721397df103f347d22da8bd46c6016dab/python/sglang/srt/layers/moe/fused_moe_native.py#L53 + len_experts = global_num_experts + + cnts = topk_ids.new_zeros((topk_ids.shape[0], len_experts)) + cnts.scatter_(1, topk_ids.to(torch.int64), 1) + tokens_per_expert = cnts.sum(dim=0) + idxs = topk_ids.view(-1).argsort() + + sorted_tokens = x[idxs // topk_ids.shape[1]] + tokens_per_expert = tokens_per_expert.cpu().numpy() + + outputs = [] + start_idx = 0 + for i, num_tokens in enumerate(tokens_per_expert): + end_idx = start_idx + num_tokens + if num_tokens == 0: + continue + tokens_for_this_expert = sorted_tokens[start_idx:end_idx] + + layer_w13_weight = layer.w13_weight[i] + layer_w2_weight = layer.w2_weight[i] + + gate_up = F.linear(tokens_for_this_expert, layer_w13_weight) + gate_up = silu_and_mul(gate_up) + expert_out = F.linear(gate_up, layer_w2_weight) + outputs.append(expert_out) + start_idx = end_idx + + outs = torch.cat(outputs, + dim=0) if len(outputs) else sorted_tokens.new_empty(0) + new_x = torch.empty_like(outs) + + new_x[idxs] = outs + final_out = (new_x.view( + *topk_ids.shape, -1).type(topk_weights.dtype).mul_( + topk_weights.unsqueeze(dim=-1)).sum(dim=1).type(new_x.dtype)) + return final_out diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index fcc6987d26bb2..54406a5a2d87f 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -358,8 +358,8 @@ class 
UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): use_prepack=True, ) elif current_platform.is_cpu(): + from vllm.model_executor.layers.fused_moe import cpu_fused_moe if current_platform.get_cpu_architecture() == CpuArchEnum.X86: - from vllm.model_executor.layers.fused_moe import cpu_fused_moe from vllm.model_executor.layers.utils import ( check_cpu_sgl_kernel) dtype_w13 = layer.w13_weight.dtype @@ -382,7 +382,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): else: layer.cpu_fused_moe = cpu_fused_moe.IPEXFusedMOE(layer) else: - raise NotImplementedError("CPU MOE only supports x86 arch.") + layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) def apply( self, From 1fdc732419d9b9eb00e003f38d6e02c480131ac8 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Tue, 26 Aug 2025 10:32:37 -0400 Subject: [PATCH 036/112] [ROCm] Starting to add AMD code reviewers for ROCm components (#23496) Signed-off-by: Hongxia Yang <hongxia.yang@amd.com> --- .github/CODEOWNERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index ce9590f02ce71..c087fd555c661 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -79,4 +79,10 @@ mkdocs.yaml @hmellor /vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep /vllm/attention/ops/triton_unified_attention.py @tdoublep +# ROCm related: specify owner with write access to notify AMD folks for careful code review +/docker/Dockerfile.rocm* @gshtras +/vllm/v1/attention/backends/rocm*.py @gshtras +/vllm/v1/attention/backends/mla/rocm*.py @gshtras +/vllm/attention/ops/rocm*.py @gshtras +/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras From 379f828fba68bcafec8b283acfd2b831fc35afb9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 26 Aug 2025 16:43:28 +0100 Subject: [PATCH 037/112] [Docs] Reduce requirements for docs build (#23651) Signed-off-by: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> --- docs/mkdocs/hooks/generate_argparse.py | 52 +++++++++++++++++------ requirements/docs.txt | 14 ------- vllm/sequence.py | 7 +++- vllm/transformers_utils/config.py | 58 ++++++++++++-------------- 4 files changed, 72 insertions(+), 59 deletions(-) diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index ed5d3b0092ae7..051a2d904406d 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import importlib import logging import sys from argparse import SUPPRESS, HelpFormatter @@ -7,25 +8,52 @@ from pathlib import Path from typing import Literal from unittest.mock import MagicMock, patch +from pydantic_core import core_schema + +logger = logging.getLogger("mkdocs") + ROOT_DIR = Path(__file__).parent.parent.parent.parent ARGPARSE_DOC_DIR = ROOT_DIR / "docs/argparse" sys.path.insert(0, str(ROOT_DIR)) -sys.modules["aiohttp"] = MagicMock() -sys.modules["blake3"] = MagicMock() sys.modules["vllm._C"] = MagicMock() -from vllm.benchmarks import latency # noqa: E402 -from vllm.benchmarks import serve # noqa: E402 -from vllm.benchmarks import throughput # noqa: E402 -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402 -from vllm.entrypoints.cli.openai import ChatCommand # noqa: E402 -from vllm.entrypoints.cli.openai import CompleteCommand # noqa: E402 -from vllm.entrypoints.openai import cli_args # noqa: E402 -from vllm.entrypoints.openai import run_batch # noqa: E402 -from vllm.utils import FlexibleArgumentParser # noqa: E402 -logger = logging.getLogger("mkdocs") +class PydanticMagicMock(MagicMock): + """`MagicMock` that's able to generate pydantic-core schemas.""" + + def __get_pydantic_core_schema__(self, source_type, handler): + return core_schema.any_schema() + + +def auto_mock(module, attr, 
max_mocks=50): + """Function that automatically mocks missing modules during imports.""" + logger.info("Importing %s from %s", attr, module) + for _ in range(max_mocks): + try: + # First treat attr as an attr, then as a submodule + return getattr(importlib.import_module(module), attr, + importlib.import_module(f"{module}.{attr}")) + except importlib.metadata.PackageNotFoundError as e: + raise e + except ModuleNotFoundError as e: + logger.info("Mocking %s for argparse doc generation", e.name) + sys.modules[e.name] = PydanticMagicMock() + + raise ImportError( + f"Failed to import {module}.{attr} after mocking {max_mocks} imports") + + +latency = auto_mock("vllm.benchmarks", "latency") +serve = auto_mock("vllm.benchmarks", "serve") +throughput = auto_mock("vllm.benchmarks", "throughput") +AsyncEngineArgs = auto_mock("vllm.engine.arg_utils", "AsyncEngineArgs") +EngineArgs = auto_mock("vllm.engine.arg_utils", "EngineArgs") +ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand") +CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand") +cli_args = auto_mock("vllm.entrypoints.openai", "cli_args") +run_batch = auto_mock("vllm.entrypoints.openai", "run_batch") +FlexibleArgumentParser = auto_mock("vllm.utils", "FlexibleArgumentParser") class MarkdownFormatter(HelpFormatter): diff --git a/requirements/docs.txt b/requirements/docs.txt index a24b9c7e924bf..3b72a8a9e755e 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -14,20 +14,6 @@ ruff # Required for argparse hook only -f https://download.pytorch.org/whl/cpu cachetools -cbor2 -cloudpickle -fastapi msgspec -openai -openai-harmony -partial-json-parser -pillow -psutil -pybase64 pydantic -setproctitle torch -transformers -zmq -uvloop -prometheus-client diff --git a/vllm/sequence.py b/vllm/sequence.py index 43d5c8beef270..3c4c77aea5ed8 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -16,14 +16,17 @@ import msgspec import torch from vllm.inputs import SingletonInputs 
-from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.pooling_params import PoolingParams from vllm.sampling_params import RequestOutputKind, SamplingParams if TYPE_CHECKING: + from vllm.lora.request import LoRARequest from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorOutput) +else: + LoRARequest = Any + KVConnectorOutput = Any VLLM_TOKEN_ID_ARRAY_TYPE = "l" @@ -1138,7 +1141,7 @@ class IntermediateTensors: """ tensors: dict[str, torch.Tensor] - kv_connector_output: Optional["KVConnectorOutput"] + kv_connector_output: Optional[KVConnectorOutput] def __init__(self, tensors): # manually define this function, so that diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 674c820daba29..2cd799e5eb5a9 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -27,19 +27,6 @@ from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME from vllm import envs from vllm.logger import init_logger -# yapf conflicts with isort for this block -# yapf: disable -from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config, - EAGLEConfig, JAISConfig, - KimiVLConfig, MedusaConfig, - MLPSpeculatorConfig, - Nemotron_Nano_VL_Config, - NemotronConfig, OvisConfig, - RWConfig, SpeculatorsConfig, - Step3TextConfig, Step3VLConfig, - UltravoxConfig) -# yapf: enable -from vllm.transformers_utils.configs.mistral import adapt_config_dict from vllm.transformers_utils.utils import check_gguf_file if envs.VLLM_USE_MODELSCOPE: @@ -67,24 +54,31 @@ def _get_hf_token() -> Optional[str]: return None -_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { - "chatglm": ChatGLMConfig, - "deepseek_vl_v2": DeepseekVLV2Config, - "kimi_vl": KimiVLConfig, - "Llama_Nemotron_Nano_VL": Nemotron_Nano_VL_Config, - "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) - "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) - 
"jais": JAISConfig, - "mlp_speculator": MLPSpeculatorConfig, - "medusa": MedusaConfig, - "eagle": EAGLEConfig, - "speculators": SpeculatorsConfig, - "nemotron": NemotronConfig, - "ovis": OvisConfig, - "ultravox": UltravoxConfig, - "step3_vl": Step3VLConfig, - "step3_text": Step3TextConfig, -} +class LazyConfigDict(dict): + + def __getitem__(self, key): + import vllm.transformers_utils.configs as configs + return getattr(configs, super().__getitem__(key)) + + +_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( + chatglm="ChatGLMConfig", + deepseek_vl_v2="DeepseekVLV2Config", + kimi_vl="KimiVLConfig", + Llama_Nemotron_Nano_VL="Nemotron_Nano_VL_Config", + RefinedWeb="RWConfig", # For tiiuae/falcon-40b(-instruct) + RefinedWebModel="RWConfig", # For tiiuae/falcon-7b(-instruct) + jais="JAISConfig", + mlp_speculator="MLPSpeculatorConfig", + medusa="MedusaConfig", + eagle="EAGLEConfig", + speculators="SpeculatorsConfig", + nemotron="NemotronConfig", + ovis="OvisConfig", + ultravox="UltravoxConfig", + step3_vl="Step3VLConfig", + step3_text="Step3TextConfig", +) _CONFIG_ATTRS_MAPPING: dict[str, str] = { "llm_config": "text_config", @@ -461,6 +455,8 @@ def get_config( model, revision, **kwargs) config_dict["max_position_embeddings"] = max_position_embeddings + from vllm.transformers_utils.configs.mistral import adapt_config_dict + config = adapt_config_dict(config_dict) # Mistral configs may define sliding_window as list[int]. 
Convert it From 513298f1b44157f7ae2f7007ef7b17c2929d11d4 Mon Sep 17 00:00:00 2001 From: Yuekai Zhang <zhangyuekai@foxmail.com> Date: Tue, 26 Aug 2025 23:47:50 +0800 Subject: [PATCH 038/112] [Bugfix] fix bf16 multimodal model hash (#23623) Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Roger Wang <hey@rogerw.io> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> --- vllm/multimodal/hasher.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 210a4ec762879..479961776a6a0 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -43,7 +43,19 @@ class MultiModalHasher: return cls.item_to_bytes( "image", np.asarray(convert_image_mode(obj, "RGBA"))) if isinstance(obj, torch.Tensor): - return cls.item_to_bytes("tensor", obj.cpu().numpy()) + tensor_obj: torch.Tensor = obj.cpu() + tensor_dtype = tensor_obj.dtype + if tensor_dtype == torch.bfloat16: + tensor_obj = tensor_obj.contiguous() + tensor_obj = tensor_obj.view( + (tensor_obj.numel(), )).view(torch.uint8) + return cls.item_to_bytes( + "tensor", { + "original_dtype": str(tensor_dtype), + "original_shape": tuple(tensor_obj.shape), + "data": tensor_obj.numpy() + }) + return cls.item_to_bytes("tensor", tensor_obj.numpy()) if isinstance(obj, np.ndarray): # If the array is non-contiguous, we need to copy it first arr_data = obj.data if obj.flags.c_contiguous else obj.tobytes() From 9d4183dd2e751e94442d7f02966d33cc085de708 Mon Sep 17 00:00:00 2001 From: Yuekai Zhang <zhangyuekai@foxmail.com> Date: Tue, 26 Aug 2025 23:48:08 +0800 Subject: [PATCH 039/112] [model] support qwen2audio embedding input (#23625) Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../models/qwen2_5_omni_thinker.py 
| 13 ++- vllm/model_executor/models/qwen2_audio.py | 109 ++++++++++++++---- 2 files changed, 93 insertions(+), 29 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index a61b8ca8f7ae7..5c64c81547e65 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -47,7 +47,7 @@ from vllm.model_executor.models.qwen2_5_vl import ( Qwen2_5_VLProcessingInfo, Qwen2_5_VLVideoEmbeddingInputs, Qwen2_5_VLVideoInputs, Qwen2_5_VLVideoPixelInputs) from vllm.model_executor.models.qwen2_audio import ( - Qwen2AudioInputs, Qwen2AudioProcessingInfo, + Qwen2AudioFeatureInputs, Qwen2AudioProcessingInfo, _get_feat_extract_output_lengths) from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalDataParser from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -534,7 +534,7 @@ class Qwen2_5OmniConditionalGenerationMixin: return torch.concat(mm_input, dim=dim) def _parse_and_validate_audio_input( - self, **kwargs: object) -> Optional[Qwen2AudioInputs]: + self, **kwargs: object) -> Optional[Qwen2AudioFeatureInputs]: input_audio_features = kwargs.pop('input_audio_features', None) audio_feature_lengths = kwargs.pop('audio_feature_lengths', None) feature_attention_mask = kwargs.pop('feature_attention_mask', None) @@ -548,9 +548,10 @@ class Qwen2_5OmniConditionalGenerationMixin: if not isinstance(input_audio_features, (torch.Tensor, list)): raise ValueError("Incorrect type of audio input features. 
" f"Got type: {type(input_audio_features)}") - return Qwen2AudioInputs(input_features=input_audio_features, - audio_feature_lengths=audio_feature_lengths, - feature_attention_mask=feature_attention_mask) + return Qwen2AudioFeatureInputs( + input_features=input_audio_features, + audio_feature_lengths=audio_feature_lengths, + feature_attention_mask=feature_attention_mask) def _parse_and_validate_image_input( self, @@ -630,7 +631,7 @@ class Qwen2_5OmniConditionalGenerationMixin: def _process_audio_input( self, - audio_input: Qwen2AudioInputs, + audio_input: Qwen2AudioFeatureInputs, audio_hashes: list[str] = None, cached_audio_features: torch.Tensor = None, ) -> torch.Tensor: diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 86c567ca36174..86b4a9a018c76 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -23,7 +23,7 @@ # limitations under the License. """Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Optional, TypedDict, Union +from typing import Any, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -36,9 +36,11 @@ from transformers.models.whisper import WhisperFeatureExtractor from vllm.config import VllmConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, +from vllm.multimodal.inputs import (AudioItem, ModalityData, + MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) -from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, +from vllm.multimodal.parse import (AudioProcessorItems, DictEmbeddingItems, + ModalityDataItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, 
PromptReplacement, @@ -52,7 +54,8 @@ from .utils import (AutoWeightsLoader, init_vllm_registered_model, # # === Audio Inputs === # -class Qwen2AudioInputs(TypedDict): +class Qwen2AudioFeatureInputs(TypedDict): + type: Literal["audio_features"] input_features: torch.Tensor """Shape: `(num_audios, num_mel_bins, 3000)`""" @@ -60,6 +63,16 @@ class Qwen2AudioInputs(TypedDict): """Shape: `(num_audios, 3000)`""" +class Qwen2AudioEmbeddingInputs(TypedDict): + type: Literal["audio_embeds"] + audio_embeds: list[torch.Tensor] + """Shape: `(num_audio_features, hidden_size)` + `hidden_size` must match the hidden size of language model backbone. + """ + + +Qwen2AudioInputs = Union[Qwen2AudioFeatureInputs, Qwen2AudioEmbeddingInputs] + # === Audio Encoder === # @@ -128,12 +141,38 @@ class Qwen2AudioDummyInputsBuilder( } +def _qwen2audio_field_config(hf_inputs: Mapping[str, torch.Tensor]): + return dict( + audio_embeds=MultiModalFieldConfig.batched("audio"), + input_features=MultiModalFieldConfig.batched("audio"), + feature_attention_mask=MultiModalFieldConfig.batched("audio"), + ) + + +class Qwen2AudioMultiModalDataParser(MultiModalDataParser): + + def _parse_audio_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[AudioItem]], + ) -> Optional[ModalityDataItems[Any, Any]]: + if isinstance(data, dict): + return DictEmbeddingItems( + data, + modality="audio", + required_fields={"audio_embeds"}, + fields_factory=_qwen2audio_field_config, + ) + + return super()._parse_audio_data(data) + + class Qwen2AudioMultiModalProcessor( BaseMultiModalProcessor[Qwen2AudioProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: feature_extractor = self.info.get_feature_extractor() - return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) + return Qwen2AudioMultiModalDataParser( + target_sr=feature_extractor.sampling_rate) def _call_hf_processor( self, @@ -173,10 +212,7 @@ class Qwen2AudioMultiModalProcessor( hf_inputs: BatchFeature, hf_processor_mm_kwargs: 
Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - input_features=MultiModalFieldConfig.batched("audio"), - feature_attention_mask=MultiModalFieldConfig.batched("audio"), - ) + return _qwen2audio_field_config(hf_inputs) def _get_prompt_updates( self, @@ -184,6 +220,7 @@ class Qwen2AudioMultiModalProcessor( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) tokenizer = self.info.get_tokenizer() vocab = tokenizer.get_vocab() @@ -211,7 +248,15 @@ class Qwen2AudioMultiModalProcessor( audio_output_lengths = audio_output_lens.tolist() def get_replacement_qwen2_audio(item_idx: int): - num_features = audio_output_lengths[item_idx] + + if audio_output_lengths: + num_features = audio_output_lengths[item_idx] + else: + audio_embeds = out_mm_data["audio_embeds"][item_idx] + assert len(audio_embeds.shape + ) == 2, "audio_embeds must be a 2D tensor" + num_features = audio_embeds.shape[0] + if num_features == 0: audios = mm_items.get_items("audio", AudioProcessorItems) audio_len = audios.get_audio_length(item_idx) @@ -286,21 +331,39 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, def _parse_and_validate_audio_input( self, **kwargs: object) -> Optional[Qwen2AudioInputs]: input_features = kwargs.pop('input_features', None) + audio_embeds = kwargs.pop('audio_embeds', None) feature_attention_mask = kwargs.pop('feature_attention_mask', None) - if input_features is None: - return None - input_features = self._validate_and_reshape_mm_tensor( - input_features, 'input_features') - feature_attention_mask = self._validate_and_reshape_mm_tensor( - feature_attention_mask, 'feature_attention_mask') - if not isinstance(input_features, (torch.Tensor, list)): - raise ValueError("Incorrect type of audio input features. 
" - f"Got type: {type(input_features)}") - return Qwen2AudioInputs(input_features=input_features, - feature_attention_mask=feature_attention_mask) - def _process_audio_input(self, - audio_input: Qwen2AudioInputs) -> torch.Tensor: + if input_features is None and audio_embeds is None: + return None + + if audio_embeds is not None: + if not isinstance(audio_embeds, (torch.Tensor, list)): + raise ValueError("Incorrect type of audio embeds. " + f"Got type: {type(audio_embeds)}") + audio_embeds = self._validate_and_reshape_mm_tensor( + audio_embeds, "audio_embeds") + return Qwen2AudioEmbeddingInputs(type="audio_embeds", + audio_embeds=audio_embeds) + + if input_features is not None: + input_features = self._validate_and_reshape_mm_tensor( + input_features, 'input_features') + feature_attention_mask = self._validate_and_reshape_mm_tensor( + feature_attention_mask, 'feature_attention_mask') + return Qwen2AudioFeatureInputs( + type="audio_features", + input_features=input_features, + feature_attention_mask=feature_attention_mask) + + raise AssertionError("This line should be unreachable.") + + def _process_audio_input( + self, audio_input: Qwen2AudioInputs + ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: + if audio_input["type"] == "audio_embeds": + audio_embeds = audio_input["audio_embeds"] + return tuple(audio_embeds) input_features = audio_input["input_features"] feature_attention_mask = audio_input["feature_attention_mask"] From 7ea22e42d5f666a26b3ce4117724dadfdb4d3887 Mon Sep 17 00:00:00 2001 From: nvjullin <jullin@nvidia.com> Date: Tue, 26 Aug 2025 23:53:04 +0800 Subject: [PATCH 040/112] [Misc] Add override for allreduce fusion thresholds (#23639) Signed-off-by: Julien Lin <jullin@nvidia.com> --- vllm/compilation/collective_fusion.py | 13 +++++++++++++ vllm/envs.py | 11 +++++++++++ 2 files changed, 24 insertions(+) diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index c44ac8e0aa7ea..0c545d8cffd24 100644 --- 
a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -10,6 +10,7 @@ from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._inductor.pattern_matcher import PatternMatcherPass from torch.distributed._symmetric_memory import enable_symm_mem_for_group +import vllm.envs as envs from vllm.config import VllmConfig from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce from vllm.distributed.parallel_state import ( @@ -401,6 +402,18 @@ if flashinfer_comm is not None: 6: MiB // 2, # 512KB 8: MiB // 2, # 512KB } + + try: + _FI_MAX_SIZES.update({ + int(k): int(float(v) * MiB) + for k, v in + envs.VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB.items() + }) + except Exception as e: + raise ValueError( + "Failed to parse VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB: " + + str(e)) from e + # opt for a more conservative default value # when world size is not in _FI_MAX_SIZES _DEFAULT_FI_MAX_SIZE = MiB // 2 diff --git a/vllm/envs.py b/vllm/envs.py index 1c9c4cdde8001..66c7c2c7f2c4d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib +import json import os import sys import tempfile @@ -1046,6 +1047,16 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")), + # Specifies the thresholds of the communicated tensor sizes under which + # vllm should use flashinfer fused allreduce. The variable should be a + # JSON with the following format: + # { <world size>: <max size in mb> } + # Unspecified world sizes will fallback to + # { 2: 64, 4: 1, <everything else>: 0.5 } + "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB": + lambda: json.loads(os.getenv( + "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB", "{}")), + # MoE routing strategy selector. 
# See `RoutingSimulator.get_available_strategies()` # for available # strategies. From 44ac25eae2cbbdc1cbcca423777107a5ca90a8f4 Mon Sep 17 00:00:00 2001 From: vllmellm <vllm.ellm@embeddedllm.com> Date: Wed, 27 Aug 2025 00:20:13 +0800 Subject: [PATCH 041/112] [CI] [Doc]: Add GH Action for auto labeling issues with `rocm` tag (#20988) Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> --- .github/workflows/issue_autolabel.yml | 305 ++++++++++++++++++++++++++ 1 file changed, 305 insertions(+) create mode 100644 .github/workflows/issue_autolabel.yml diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml new file mode 100644 index 0000000000000..6401d6586cc3d --- /dev/null +++ b/.github/workflows/issue_autolabel.yml @@ -0,0 +1,305 @@ +name: Label issues based on keywords +on: + issues: + types: [opened, edited, reopened] +permissions: + issues: write # needed so the workflow can add labels + contents: read +concurrency: + group: issue-labeler-${{ github.event.issue.number }} + cancel-in-progress: true +jobs: + add-labels: + runs-on: ubuntu-latest + steps: + - name: Label issues based on keywords + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + with: + script: | + // Configuration: Add new labels and keywords here + const labelConfig = { + rocm: { + // Keyword search - matches whole words only (with word boundaries) + keywords: [ + { + term: "composable kernel", + searchIn: "both" + }, + { + term: "rccl", + searchIn: "body" // only search in body + }, + { + term: "migraphx", + searchIn: "title" // only search in title + }, + { + term: "hipgraph", + searchIn: "both" + }, + { + term: "ROCm System Management Interface", + searchIn: "body" + }, + ], + + // Substring search - matches anywhere in text (partial matches) + substrings: [ + { + term: "VLLM_ROCM_", + searchIn: "both" + }, + { + term: "rocm", + searchIn: "title" + }, + { + term: "amd", + 
searchIn: "title" + }, + { + term: "hip-", + searchIn: "both" + }, + { + term: "gfx", + searchIn: "both" + }, + { + term: "cdna", + searchIn: "both" + }, + { + term: "rdna", + searchIn: "both" + }, + { + term: "torch_hip", + searchIn: "body" // only in body + }, + { + term: "_hip", + searchIn: "both" + }, + { + term: "hip_", + searchIn: "both" + }, + + // ROCm tools and libraries + { + term: "hipify", + searchIn: "both" + }, + ], + + // Regex patterns - for complex pattern matching + regexPatterns: [ + { + pattern: "\\bmi\\d{3}[a-z]*\\b", + description: "AMD GPU names (mi + 3 digits + optional letters)", + flags: "gi", + searchIn: "both" // "title", "body", or "both" + } + ], + }, + }; + + // Helper function to create regex based on search type + function createSearchRegex(term, type) { + // Escape special regex characters in the term + const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + + switch (type) { + case 'keyword': + // Word boundary search - matches whole words only + return new RegExp(`\\b${escapedTerm}\\b`, "gi"); + case 'substring': + // Substring search - matches anywhere in the text + return new RegExp(escapedTerm, "gi"); + default: + throw new Error(`Unknown search type: ${type}`); + } + } + + // Helper function to find matching terms in text with line information + function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') { + const matches = []; + const lines = text.split('\n'); + + for (const termConfig of searchTerms) { + let regex; + let term, searchIn, pattern, description, flags; + + // Handle different input formats (string or object) + if (typeof termConfig === 'string') { + term = termConfig; + searchIn = 'both'; // default + } else { + term = termConfig.term; + searchIn = termConfig.searchIn || 'both'; + pattern = termConfig.pattern; + description = termConfig.description; + flags = termConfig.flags; + } + + // Skip if this term shouldn't be searched in the current location + if 
(searchIn !== 'both' && searchIn !== searchLocation) { + continue; + } + + // Create appropriate regex + if (searchType === 'regex') { + regex = new RegExp(pattern, flags || "gi"); + } else { + regex = createSearchRegex(term, searchType); + } + + const termMatches = []; + + // Check each line for matches + lines.forEach((line, lineIndex) => { + const lineMatches = line.match(regex); + if (lineMatches) { + lineMatches.forEach(match => { + termMatches.push({ + match: match, + lineNumber: lineIndex + 1, + lineContent: line.trim(), + searchType: searchType, + searchLocation: searchLocation, + originalTerm: term || pattern, + description: description, + // Show context around the match in the line + context: line.length > 100 ? + line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), + line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' + : line.trim() + }); + }); + } + }); + + if (termMatches.length > 0) { + matches.push({ + term: term || (description || pattern), + searchType: searchType, + searchLocation: searchLocation, + searchIn: searchIn, + pattern: pattern, + matches: termMatches, + count: termMatches.length + }); + } + } + + return matches; + } + + // Helper function to check if label should be added + async function processLabel(labelName, config) { + const body = context.payload.issue.body || ""; + const title = context.payload.issue.title || ""; + + core.notice(`Processing label: ${labelName}`); + core.notice(`Issue Title: "${title}"`); + core.notice(`Issue Body length: ${body.length} characters`); + + let shouldAddLabel = false; + let allMatches = []; + let reason = ''; + + const keywords = config.keywords || []; + const substrings = config.substrings || []; + const regexPatterns = config.regexPatterns || []; + + core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`); + + // Search in title + if (title.trim()) { + 
core.notice(`Searching in title: "${title}"`); + + const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title'); + const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title'); + const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title'); + + allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches); + } + + // Search in body + if (body.trim()) { + core.notice(`Searching in body (${body.length} characters)`); + + const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body'); + const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body'); + const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body'); + + allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches); + } + + if (allMatches.length > 0) { + core.notice(`Found ${allMatches.length} matching term(s):`); + + for (const termMatch of allMatches) { + const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body'; + const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn; + + if (termMatch.searchType === 'regex') { + core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); + } else { + core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); + } + + // Show details for each match + termMatch.matches.forEach((match, index) => { + core.notice(` ${index + 1}. 
Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`); + if (match.description) { + core.notice(` Description: ${match.description}`); + } + core.notice(` Context: ${match.context}`); + if (match.lineContent !== match.context) { + core.notice(` Full line: ${match.lineContent}`); + } + }); + } + + shouldAddLabel = true; + const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0); + const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0); + const bodyMatches = allMatches.filter(t => t.searchLocation === 'body').reduce((sum, t) => sum + t.count, 0); + const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0); + const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0); + const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0); + + reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`; + } + + core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`); + core.notice(`Reason: ${reason || 'No matching terms found'}`); + + if (shouldAddLabel) { + const existingLabels = context.payload.issue.labels.map(l => l.name); + if (!existingLabels.includes(labelName)) { + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + labels: [labelName], + }); + core.notice(`Label "${labelName}" added. 
${reason}`); + return true; + } + core.notice(`Label "${labelName}" already present.`); + return false; + } + + core.notice(`No matching terms found for label "${labelName}".`); + return false; + } + + // Process all configured labels + const processLabels = Object.entries(labelConfig) + .map(([labelName, config]) => processLabel(labelName, config)); + const labelsAdded = await Promise.all(processLabels); + const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0); + core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`); \ No newline at end of file From 9b0187003e62bdb7311b23b5b5026ea8e4e207d3 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" <jiang1.li@intel.com> Date: Wed, 27 Aug 2025 01:10:42 +0800 Subject: [PATCH 042/112] [Bugfix] Fix cuda event usage with CPU model runner (#23643) Signed-off-by: jiang1.li <jiang1.li@intel.com> --- vllm/v1/worker/cpu_model_runner.py | 28 +++++++++++++++++++++++++--- vllm/v1/worker/gpu_model_runner.py | 2 +- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index a7180afbd64b5..137578f0e6088 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -11,6 +11,7 @@ from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.v1.attention.backends.cpu_attn import TorchSDPAMetadataBuilderV1 from vllm.v1.worker.gpu_model_runner import GPUModelRunner +from vllm.v1.worker.utils import CpuGpuBuffer if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput @@ -21,7 +22,8 @@ logger = init_logger(__name__) class CPUModelRunner(GPUModelRunner): def __init__(self, vllm_config: VllmConfig, device: torch.device): - super().__init__(vllm_config, device) + with _torch_cuda_wrapper(): + super().__init__(vllm_config, device) assert device == torch.device("cpu") assert self.speculative_config is None, "spec decode is not supported." 
@@ -71,8 +73,8 @@ class CPUModelRunner(GPUModelRunner): setattr(obj, device_attr_name, cpu_tensor) for k, v in vars(self).items(): - if k.endswith("_cpu") and isinstance(v, torch.Tensor): - replace_tensor(self, k, k[:-4]) + if isinstance(v, CpuGpuBuffer): + v.gpu = v.cpu for k, v in vars(self.input_batch).items(): if k.endswith("_cpu_tensor") and isinstance(v, torch.Tensor): @@ -108,6 +110,26 @@ class CPUModelRunner(GPUModelRunner): def _sync_device(self) -> None: pass + def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: + return sampled_token_ids.tolist() + + +@contextmanager +def _torch_cuda_wrapper(): + + class _EventPlaceholder: + + def __init__(self, *args, **kwargs) -> None: + self.record = lambda: None + self.synchronize = lambda: None + + try: + cuda_event = torch.cuda.Event + torch.cuda.Event = _EventPlaceholder + yield + finally: + torch.cuda.Event = cuda_event + @contextmanager def _set_global_compilation_settings(config: VllmConfig): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 14f2305dadc54..f1ceaaae62a70 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -321,7 +321,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): (self.max_model_len, 1), dtype=torch.int64, device="cpu", - pin_memory=True) + pin_memory=self.pin_memory) def _make_buffer(self, *args, dtype: torch.dtype) -> CpuGpuBuffer: return CpuGpuBuffer(*args, From 730d0ac8b9678d64294ddc1e3431a27a50b5e42f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= <ohg3417@gmail.com> Date: Wed, 27 Aug 2025 03:19:23 +0900 Subject: [PATCH 043/112] [Docs] Fix warnings in `mkdocs build` (#23649) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Zerohertz <ohg3417@gmail.com> Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com> Signed-off-by: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../backends/differential_flash_attn.py | 14 ++++++---- vllm/attention/backends/flash_attn.py | 5 ++-- vllm/attention/backends/rocm_flash_attn.py | 11 ++++---- vllm/attention/backends/utils.py | 2 +- vllm/attention/backends/xformers.py | 12 ++++----- vllm/core/block_manager.py | 8 +++--- vllm/engine/async_llm_engine.py | 4 +-- vllm/engine/llm_engine.py | 8 +++--- vllm/entrypoints/llm.py | 10 +++---- .../tool_parsers/minimax_tool_parser.py | 3 ++- vllm/model_executor/layers/lightning_attn.py | 11 +++++++- vllm/model_executor/layers/linear.py | 5 ++-- vllm/outputs.py | 4 +-- vllm/sequence.py | 27 +++++++------------ 14 files changed, 66 insertions(+), 58 deletions(-) diff --git a/vllm/attention/backends/differential_flash_attn.py b/vllm/attention/backends/differential_flash_attn.py index ce9467efd23c7..caa02530d2fd6 100644 --- a/vllm/attention/backends/differential_flash_attn.py +++ b/vllm/attention/backends/differential_flash_attn.py @@ -805,14 +805,18 @@ class DifferentialFlashAttentionImpl(AttentionImpl): """Forward pass with FlashAttention. Args: - query: shape = [num_tokens, num_heads, head_size] - key: shape = [num_tokens, num_kv_heads, head_size] - value: shape = [num_tokens, num_kv_heads, head_size] - output: shape = [num_tokens, num_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + layer: Attention layer instance. + q: Query tensor with shape = [num_tokens, num_heads, head_size] + k: Key tensor with shape = [num_tokens, num_kv_heads, head_size] + v: Value tensor with shape = [num_tokens, num_kv_heads, head_size] + kv_cache: KV cache tensor with shape + [2, num_blocks, block_size, num_kv_heads, head_size]. NOTE: kv_cache will be an empty tensor with shape [0] for profiling run. attn_metadata: Metadata for attention. 
+ output: Output tensor with shape [num_tokens, num_heads, head_size] + output_scale: Optional output scale tensor. + output_block_scale: Optional output block scale tensor. NOTE: It in-place updates the output tensor. NOTE: FP8 quantization, flash-attn expect the size of {q,k,v}_descale to be (num_sequences, num_kv_heads). diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index ba7a9afe86782..d8cb208c4f2ea 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -605,7 +605,8 @@ class FlashAttentionImpl(AttentionImpl): key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] output: shape = [num_tokens, num_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: KV cache tensor with shape + [2, num_blocks, block_size, num_kv_heads, head_size]. NOTE: kv_cache will be an empty tensor with shape [0] for profiling run. attn_metadata: Metadata for attention. @@ -850,7 +851,7 @@ class FlashAttentionImpl(AttentionImpl): def _get_query_key_seq_metadata( - attn_metadata, + attn_metadata: FlashAttentionMetadata, is_prompt: bool, attn_type: str, ) -> tuple: diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index e4c27a0ef36e9..9262144e37b54 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -584,17 +584,18 @@ class ROCmFlashAttentionImpl(AttentionImpl): use prefill sequence attributes Args: + layer: Attention layer instance. query: shape = [num_tokens, num_heads * head_size] key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + kv_cache: KV cache tensor with shape + [2, num_blocks, block_size * num_kv_heads * head_size]. 
NOTE: kv_cache will be an empty tensor with shape [0] for profiling run. attn_metadata: Metadata for attention. - attn_type: Select attention type, between encoder attention, - decoder self-attention, or encoder/decoder cross- - attention. Defaults to decoder self-attention, - which is the vLLM default generally + output: Optional output tensor. + output_scale: Optional output scale tensor. + output_block_scale: Optional output block scale tensor. Returns: shape = [num_tokens, num_heads * head_size] """ diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 34e059067d84d..7b6c426b0f851 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -561,7 +561,7 @@ def get_num_prefill_decode_query_kv_tokens( Raises: AssertionError: If the number of encoder tokens in `attn_metadata` - is `None` when required for the calculations. + is `None` when required for the calculations. """ num_prefill_query_tokens = 0 num_decode_query_tokens = 0 diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index c1213f7620a7a..302d3d7ea903f 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -471,17 +471,18 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): max_encoder_seq_len) Args: + layer: Attention layer instance. query: shape = [num_tokens, num_heads * head_size] key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + kv_cache: KV cache tensor with shape + [2, num_blocks, block_size * num_kv_heads * head_size]. NOTE: kv_cache will be an empty tensor with shape [0] for profiling run. attn_metadata: Metadata for attention. - attn_type: Select attention type, between encoder attention, - decoder self-attention, or encoder/decoder cross- - attention. 
Defaults to decoder self-attention, - which is the vLLM default generally + output: Optional output tensor. + output_scale: Optional output scale tensor. + output_block_scale: Optional output block scale tensor. Returns: shape = [num_tokens, num_heads * head_size] """ @@ -644,7 +645,6 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): for API spec. Args: - output: shape = [num_prefill_tokens, num_heads, head_size] query: shape = [num_prefill_tokens, num_heads, head_size] key: shape = [num_prefill_tokens, num_kv_heads, head_size] value: shape = [num_prefill_tokens, num_kv_heads, head_size] diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 4ec5a775f465c..cbfa4d7ff3c4c 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -352,7 +352,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager): with num_lookahead_slots. Args: - sequence_group (SequenceGroup): The sequence group to swap in. + seq_group (SequenceGroup): The sequence group to swap in. num_lookahead_slots (int): Number of lookahead slots used in speculative decoding, default to 0. @@ -405,8 +405,6 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager): Args: seq_group (SequenceGroup): The sequence group to swap out. - num_lookahead_slots (int): Number of lookahead slots used in - speculative decoding, default to 0. Returns: bool: Whether it's possible to swap out current sequence group. @@ -420,7 +418,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager): swapping out the given sequence_group with num_lookahead_slots. Args: - sequence_group (SequenceGroup): The sequence group to swap out. + seq_group (SequenceGroup): The sequence group to swap out. Returns: List[Tuple[int, int]]: The mapping of swapping block from @@ -473,7 +471,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager): on to the 'device'. Args: - sequence_group (SequenceGroup): The sequence group to swap in/out. + seq_group (SequenceGroup): The sequence group to swap in/out. 
device (Device): device to swap the 'seq_group' on. status (SequenceStatus): The status of sequence which is needed for action. RUNNING for swap out and SWAPPED for swap in diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 84ad2299b0655..4fb028627a8c4 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -486,10 +486,10 @@ class AsyncLLMEngine(EngineClient): _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine def __init__(self, - *args, + *args: Any, log_requests: bool = True, start_engine_loop: bool = True, - **kwargs) -> None: + **kwargs: Any) -> None: if envs.VLLM_USE_V1: raise ValueError( "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. " diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index dbf8d3ba50146..cbd714c159eb5 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -644,10 +644,10 @@ class LLMEngine: Details: - Set arrival_time to the current time if it is None. - Set prompt_token_ids to the encoded prompt if it is None. - - Create `n` number of [Sequence][vllm.Sequence] objects. - - Create a [SequenceGroup][vllm.SequenceGroup] object - from the list of [Sequence][vllm.Sequence]. - - Add the [SequenceGroup][vllm.SequenceGroup] object to the + - Create `n` number of [Sequence][vllm.sequence.Sequence] objects. + - Create a [SequenceGroup][vllm.sequence.SequenceGroup] object + from the list of [Sequence][vllm.sequence.Sequence]. + - Add the [SequenceGroup][vllm.sequence.SequenceGroup] object to the scheduler. 
Example: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 728ed8328d36d..8816ff56d6840 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -186,7 +186,7 @@ class LLM: CompilationConfig]] = None, logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None, - **kwargs, + **kwargs: Any, ) -> None: """LLM constructor.""" @@ -697,8 +697,8 @@ class LLM: Generate responses for a chat conversation. The chat conversation is converted into a text prompt using the - tokenizer and calls the [generate][] method to generate the - responses. + tokenizer and calls the [generate][vllm.LLM.generate] method to generate + the responses. Multi-modal inputs can be passed in the same way you would pass them to the OpenAI API. @@ -1334,8 +1334,8 @@ class LLM: def wake_up(self, tags: Optional[list[str]] = None): """ - Wake up the engine from sleep mode. See the [sleep][] method - for more details. + Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep] + method for more details. Args: tags: An optional list of tags to reallocate the engine memory diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py index 283e6095013d6..0fd62f0b6a7f1 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py @@ -461,7 +461,8 @@ class MinimaxToolParser(ToolParser): i += 1 return boundaries - def _extract_tool_args(self, tool_content: str, args_match) -> str: + def _extract_tool_args(self, tool_content: str, + args_match: re.Match[str]) -> str: """ Extract tool arguments from tool content. 
diff --git a/vllm/model_executor/layers/lightning_attn.py b/vllm/model_executor/layers/lightning_attn.py index 8ffc700ca5cde..0b87acc851208 100644 --- a/vllm/model_executor/layers/lightning_attn.py +++ b/vllm/model_executor/layers/lightning_attn.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + import torch from einops import rearrange @@ -453,7 +455,14 @@ class _attention(torch.autograd.Function): lightning_attention_ = _attention.apply -def lightning_attention(q, k, v, ed, block_size=256, kv_history=None): +def lightning_attention( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + ed: torch.Tensor, + block_size: int = 256, + kv_history: Optional[torch.Tensor] = None +) -> tuple[torch.Tensor, torch.Tensor]: """ Apply lightning attention algorithm to compute attention efficiently. diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index dd54aebeb011e..c0fcacd1e6ee9 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -233,10 +233,10 @@ class LinearBase(CustomOp): Args: input_size: input dimension of the linear layer. output_size: output dimension of the linear layer. - bias: If true, add bias. skip_bias_add: If true, skip adding bias but instead return it. params_dtype: Data type for the parameters. quant_config: Quantization configure. + prefix: Prefix for parameter names. return_bias: If true, return bias together with outputs in forward pass. """ @@ -378,13 +378,14 @@ class MergedReplicatedLinear(ReplicatedLinear): Args: input_size: input dimension of the linear layer. - output_size: output dimension of the linear layer. + output_sizes: list of output dimensions of the linear layer. bias: If true, add bias. skip_bias_add: If true, skip adding bias but instead return it. params_dtype: Data type for the parameters. quant_config: Quantization configure. 
prefix: The name of the layer in the state dict, including all parents (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. """ def __init__( diff --git a/vllm/outputs.py b/vllm/outputs.py index 9784a8894472f..acdb2f89ce735 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -409,7 +409,7 @@ class EmbeddingOutput: Args: embedding: The embedding vector, which is a list of floats. - Its length depends on the hidden dimension of the model. + Its length depends on the hidden dimension of the model. """ embedding: list[float] @@ -447,7 +447,7 @@ class ClassificationOutput: Args: probs: The probability vector, which is a list of floats. - Its length depends on the number of classes. + Its length depends on the number of classes. """ probs: list[float] diff --git a/vllm/sequence.py b/vllm/sequence.py index 3c4c77aea5ed8..36b1b198bd5a5 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -147,18 +147,7 @@ class SequenceDataDelta( class SequenceData(msgspec.Struct, omit_defaults=True): # type: ignore[call-arg] - """Data associated with a sequence. - - Args: - prompt_token_ids: The token IDs of the prompt. - output_token_ids: The token IDs of the output. Set to an empty list if - None. - - Attributes: - prompt_token_ids: The token IDs of the prompt. - output_token_ids: The token IDs of the output. - cumulative_logprob: The cumulative log probability of the output. - """ + """Data associated with a sequence.""" # NOTE: we cannot use Union[list, array] because msgspec cannot support # union of 2 list types. 
_prompt_token_ids: array @@ -256,10 +245,12 @@ class SequenceData(msgspec.Struct, @property def cumulative_logprob(self) -> float: + """The cumulative log probability of the output.""" return self._cumulative_logprob @property def prompt_token_ids(self) -> tuple[int, ...]: + """The token IDs of the prompt.""" return self._prompt_token_ids_tuple @prompt_token_ids.setter @@ -277,6 +268,7 @@ class SequenceData(msgspec.Struct, @property def output_token_ids(self) -> tuple[int, ...]: + """The token IDs of the output.""" return tuple(self._output_token_ids) @output_token_ids.setter @@ -940,7 +932,7 @@ class SequenceGroupMetadata( omit_defaults=True): # type: ignore[call-arg] """Metadata for a sequence group. Used to create `AttentionMetadata`. - Args: + Attributes: request_id: The ID of the request. is_prompt: Whether the request is at prompt stage. seq_data: The sequence data. (Seq id -> sequence data) @@ -950,14 +942,14 @@ class SequenceGroupMetadata( do_sample: True if sampling is required. Sampling is not required when e.g., prefill is chunked, and the current iteration only computes query tokens for prefill, we don't need sampling. - token_chunk_size: The number of tokens to be processed (per sequence). - None if chunking is not required. + pooling_params: Pooling parameters. lora_request: LoRA request. computed_block_nums: The block numbers that are already computed, used in prefix caching. state: Internal state tied to this sequence group. + token_type_ids: Token type IDs. multi_modal_data: Multi modal data. - mm_processor_kwargs: Multimodal input processor / mapper overrides. + multi_modal_placeholders: Multi modal placeholders. encoder_seq_data: Optional sequence data for encoder prompt (SequenceGroup.encoder_seq). Should be None unless you are working with an encoder/decoder @@ -1043,12 +1035,13 @@ class SequenceOutput( array_like=True): # type: ignore[call-arg] """The model output associated with a sequence. 
- Args: + Attributes: parent_seq_id: The ID of the parent sequence (for forking in beam search). output_token: The output token ID. logprobs: The logprobs of the output token. (Token id -> logP(x_i+1 | x_0, ..., x_i)) + output_embed: Optional output embedding tensor. """ parent_seq_id: int output_token: int From 227e231b55901be4e050d5a8f033e90f45cfba85 Mon Sep 17 00:00:00 2001 From: Thomas Parnell <tpa@zurich.ibm.com> Date: Tue, 26 Aug 2025 20:33:16 +0200 Subject: [PATCH 044/112] [Docs] [V1] [Hybrid] Update docs to remove FlashInfer constraint for hybrid models (#23665) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> --- docs/usage/v1_guide.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 7fc615d4c042f..64bd0d9bf5071 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -111,11 +111,10 @@ Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaFor Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that -these models currently require disabling prefix caching and using the FlashInfer attention backend in V1. +these models currently require disabling prefix caching in V1. Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`). -Please note that these models currently require disabling prefix caching, enforcing eager mode, and using the FlashInfer -attention backend in V1. +Please note that these models currently require disabling prefix caching and enforcing eager mode in V1. 
#### Encoder-Decoder Models From 98aa16ff41353e3e6c8a3c2f4e933a888dbce1cb Mon Sep 17 00:00:00 2001 From: Russell Bryant <rbryant@redhat.com> Date: Tue, 26 Aug 2025 14:49:06 -0400 Subject: [PATCH 045/112] [v1] Add cross-attention KV cache support for encoder-decoder models (#23664) Signed-off-by: Russell Bryant <rbryant@redhat.com> --- vllm/multimodal/registry.py | 19 +++++++ vllm/v1/core/kv_cache_coordinator.py | 34 ++++++++---- vllm/v1/core/kv_cache_manager.py | 6 ++- vllm/v1/core/sched/scheduler.py | 37 ++++++++++++- vllm/v1/core/single_type_kv_cache_manager.py | 56 +++++++++++++++++++- vllm/v1/kv_cache_interface.py | 15 ++++++ 6 files changed, 153 insertions(+), 14 deletions(-) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index ded56cca80999..8cd9e5604872a 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -372,3 +372,22 @@ class MultiModalRegistry: ) return dummy_data + + def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int: + """ + Get the maximum length of the encoder input for encoder-decoder models. + """ + if not model_config.is_encoder_decoder: + return 0 + max_tokens = self.\ + get_max_tokens_per_item_by_nonzero_modality(model_config) + if not max_tokens: + # TODO - this function assumes encoder-decoder models are + # multimodal. This will need to change when adding support for more + # than whisper. + return 0 + assert len(max_tokens) == 1, "Encoder-decoder models are expected \ + to implement the multimodal interface with at most one modality." 
+ + first_modality = next(iter(max_tokens)) + return max_tokens[first_modality] diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index a0ea4d96015a2..f082ad00f2e35 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -6,7 +6,7 @@ from typing import Optional from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock from vllm.v1.core.single_type_kv_cache_manager import ( - FullAttentionManager, get_manager_for_kv_cache_spec) + CrossAttentionManager, FullAttentionManager, get_manager_for_kv_cache_spec) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) from vllm.v1.request import Request @@ -42,9 +42,10 @@ class KVCacheCoordinator(ABC): ) for i, kv_cache_group in enumerate( self.kv_cache_config.kv_cache_groups)) - def get_num_blocks_to_allocate( - self, request_id: str, num_tokens: int, - new_computed_blocks: tuple[list[KVCacheBlock], ...]) -> int: + def get_num_blocks_to_allocate(self, request_id: str, num_tokens: int, + new_computed_blocks: tuple[ + list[KVCacheBlock], ...], + num_encoder_tokens: int) -> int: """ Get the number of blocks needed to be allocated for the request. @@ -54,14 +55,22 @@ class KVCacheCoordinator(ABC): tokens that are already allocated). new_computed_blocks: The new computed blocks just hitting the prefix caching. + num_encoder_tokens: The number of encoder tokens for allocating + blocks for cross-attention. Returns: The number of blocks. """ num_blocks_to_allocate = 0 for i, manager in enumerate(self.single_type_managers): - num_blocks_to_allocate += manager.get_num_blocks_to_allocate( - request_id, num_tokens, new_computed_blocks[i]) + if isinstance(manager, CrossAttentionManager): + # For cross-attention, we issue a single static allocation + # of blocks based on the number of encoder input tokens. 
+ num_blocks_to_allocate += manager.get_num_blocks_to_allocate( + request_id, num_encoder_tokens, []) + else: + num_blocks_to_allocate += manager.get_num_blocks_to_allocate( + request_id, num_tokens, new_computed_blocks[i]) return num_blocks_to_allocate def save_new_computed_blocks( @@ -79,8 +88,11 @@ class KVCacheCoordinator(ABC): manager.save_new_computed_blocks(request_id, new_computed_blocks[i]) - def allocate_new_blocks(self, request_id: str, - num_tokens: int) -> tuple[list[KVCacheBlock], ...]: + def allocate_new_blocks( + self, + request_id: str, + num_tokens: int, + num_encoder_tokens: int = 0) -> tuple[list[KVCacheBlock], ...]: """ Allocate new blocks for the request to give it at least `num_tokens` token slots. @@ -89,12 +101,16 @@ class KVCacheCoordinator(ABC): request_id: The request ID. num_tokens: The total number of tokens that need a slot (including tokens that are already allocated). + num_encoder_tokens: The number of encoder tokens for allocating + blocks for cross-attention. Returns: The new allocated blocks. """ return tuple( - manager.allocate_new_blocks(request_id, num_tokens) + manager.allocate_new_blocks( + request_id, num_encoder_tokens if isinstance( + manager, CrossAttentionManager) else num_tokens) for manager in self.single_type_managers) def cache_blocks(self, request: Request, num_computed_tokens: int) -> None: diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index fd0bdb2c80fc5..b427a9c497fef 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -187,6 +187,7 @@ class KVCacheManager: new_computed_blocks: Optional[KVCacheBlocks] = None, num_lookahead_tokens: int = 0, delay_cache_blocks: bool = False, + num_encoder_tokens: int = 0, ) -> Optional[KVCacheBlocks]: """Add slots for a request with new tokens to append. 
@@ -253,6 +254,7 @@ class KVCacheManager: request_id=request.request_id, num_tokens=num_tokens_need_slot, new_computed_blocks=new_computed_block_list, + num_encoder_tokens=num_encoder_tokens, ) if num_blocks_to_allocate > self.block_pool.get_num_free_blocks(): @@ -273,7 +275,7 @@ class KVCacheManager: new_computed_block_list) new_blocks = self.coordinator.allocate_new_blocks( - request.request_id, num_tokens_need_slot) + request.request_id, num_tokens_need_slot, num_encoder_tokens) # P/D: delay caching blocks if we have to recv from # remote. Update state for locally cached blocks. @@ -292,7 +294,7 @@ class KVCacheManager: def free(self, request: Request) -> None: """Free the blocks allocated for the request. - We free the blocks in reverse order so that he tail blocks are evicted + We free the blocks in reverse order so that the tail blocks are evicted first when caching is enabled. Args: diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 522b340b32aaf..14a914d8f2f0b 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -58,6 +58,7 @@ class Scheduler(SchedulerInterface): self.parallel_config = vllm_config.parallel_config self.log_stats = log_stats self.structured_output_manager = structured_output_manager + self.is_encoder_decoder = vllm_config.model_config.is_encoder_decoder # include_finished_set controls whether a separate set of finished # request ids should be included in the EngineCoreOutputs returned @@ -83,6 +84,9 @@ class Scheduler(SchedulerInterface): assert len(self.kv_cache_config.kv_cache_groups) == 1, ( "Multiple KV cache groups are not currently supported " "with KV connectors") + assert not self.is_encoder_decoder, ( + "Encoder-decoder models are not currently supported " + "with KV connectors") self.connector = KVConnectorFactory.create_connector( config=self.vllm_config, role=KVConnectorRole.SCHEDULER) @@ -431,6 +435,22 @@ class Scheduler(SchedulerInterface): == 0 else 
self.num_lookahead_tokens) + # Determine if we need to allocate cross-attention blocks. + if self.is_encoder_decoder and request.has_encoder_inputs: + # TODO(russellb): For Whisper, we know that the input is + # always padded to the maximum length. If we support other + # encoder-decoder models, this will need to be updated if we + # want to only allocate what is needed. + assert ("whisper" + in self.vllm_config.model_config.model.lower()), ( + "Whisper is the only supported " + "encoder-decoder model.") + num_encoder_tokens = MULTIMODAL_REGISTRY.\ + get_encdec_max_encoder_len( + self.vllm_config.model_config) + else: + num_encoder_tokens = 0 + new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens + num_external_computed_tokens, @@ -438,6 +458,7 @@ class Scheduler(SchedulerInterface): new_computed_blocks, num_lookahead_tokens=effective_lookahead_tokens, delay_cache_blocks=load_kv_async, + num_encoder_tokens=num_encoder_tokens, ) if new_blocks is None: @@ -703,7 +724,21 @@ class Scheduler(SchedulerInterface): # The encoder input is not needed in this step. break - if start_pos + num_encoder_tokens <= num_computed_tokens: + if self.is_encoder_decoder and num_computed_tokens > 0: + assert start_pos == 0, ( + "Encoder input should be processed at the beginning of " + "the sequence when encoder-decoder models are used.") + # Encoder input has already been computed + # The calculation here is a bit different. We don't turn encoder + # output into tokens that get processed by the decoder and + # reflected in num_computed_tokens. Instead, start_pos reflects + # the position where we need to ensure we calculate encoder + # inputs. This should always be 0 to ensure we calculate encoder + # inputs before running the decoder. Once we've calculated some + # decoder tokens (num_computed_tokens > 0), then we know we + # already calculated encoder inputs and can skip here. 
+ continue + elif start_pos + num_encoder_tokens <= num_computed_tokens: # The encoder input is already computed and stored # in the decoder's KV cache. continue diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 82e0292522b9a..f0af92122958c 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -8,8 +8,9 @@ from vllm.utils import cdiv from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec, - FullAttentionSpec, KVCacheSpec, - MambaSpec, SlidingWindowSpec) + CrossAttentionSpec, FullAttentionSpec, + KVCacheSpec, MambaSpec, + SlidingWindowSpec) from vllm.v1.request import Request @@ -552,11 +553,62 @@ class MambaManager(SingleTypeKVCacheManager): return new_blocks +class CrossAttentionManager(SingleTypeKVCacheManager): + """Manager for cross-attention KV cache in encoder-decoder models.""" + + def save_new_computed_blocks( + self, request_id: str, + new_computed_blocks: list[KVCacheBlock]) -> None: + # We do not cache blocks for cross-attention to be shared between + # requests, so `new_computed_blocks` should always be empty. + assert len(new_computed_blocks) == 0 + + def cache_blocks(self, request: Request, num_tokens: int) -> None: + # We do not cache blocks for cross-attention to be shared between + # requests, so this method is not relevant. 
+ raise ValueError("Should not be called as prefix caching is disabled.") + + def get_num_common_prefix_blocks(self, request_id: str, + num_running_requests: int) -> int: + # Cross-attention blocks contain request-specific encoder states + # and are not shared between different requests + return 0 + + @classmethod + def find_longest_cache_hit( + cls, + block_hashes: list[BlockHash], + max_length: int, + kv_cache_group_ids: list[int], + block_pool: BlockPool, + kv_cache_spec: KVCacheSpec, + use_eagle: bool, + ) -> tuple[list[KVCacheBlock], ...]: + assert isinstance(kv_cache_spec, CrossAttentionSpec), ( + "CrossAttentionManager can only be used for cross-attention groups" + ) + # Cross-attention does not benefit from prefix caching since: + # 1. Encoder states are unique per request (different audio/image + # inputs) + # 2. Encoder states are computed once per request, not incrementally + # 3. No reusable prefix exists between different multimodal inputs + # Return empty blocks to indicate no cache hits + raise NotImplementedError( + "CrossAttentionManager does not support caching") + + def remove_skipped_blocks(self, request_id: str, + num_computed_tokens: int) -> None: + # Cross-attention blocks represent encoder states which are needed + # for the entire decoding process, so no blocks should be skipped + pass + + spec_manager_map: dict[type[KVCacheSpec], type[SingleTypeKVCacheManager]] = { FullAttentionSpec: FullAttentionManager, SlidingWindowSpec: SlidingWindowManager, ChunkedLocalAttentionSpec: ChunkedLocalAttentionManager, MambaSpec: MambaManager, + CrossAttentionSpec: CrossAttentionManager, } diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index ed8e0bf798988..a3e4d393e4d20 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -11,6 +11,7 @@ from typing_extensions import Self from vllm.config import VllmConfig from vllm.logger import init_logger +from vllm.multimodal import MULTIMODAL_REGISTRY from 
vllm.utils import cdiv, get_dtype_size logger = init_logger(__name__) @@ -211,6 +212,20 @@ class EncoderOnlyAttentionSpec(AttentionSpec): return 0 +@dataclass(frozen=True) +class CrossAttentionSpec(AttentionSpec): + """ + KV cache spec for cross-attention layers in encoder-decoder models. + """ + + def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: + # For cross-attention, we need to cache encoder states + # Get encoder length (e.g., 1500 for Whisper). + max_encoder_len = MULTIMODAL_REGISTRY.\ + get_encdec_max_encoder_len(vllm_config.model_config) + return cdiv(max_encoder_len, self.block_size) * self.page_size_bytes + + @dataclass class KVCacheTensor: """ From 9715f7bb0fd70fa3dac6f35c824e90e58f0086ce Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Wed, 27 Aug 2025 03:01:25 +0800 Subject: [PATCH 046/112] [Bugfix] Fix incorrect original shape in hashing (#23672) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Co-authored-by: Lukas Geiger <lukas.geiger94@gmail.com> --- tests/multimodal/test_hasher.py | 7 ++++--- vllm/multimodal/hasher.py | 10 ++++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/multimodal/test_hasher.py b/tests/multimodal/test_hasher.py index 75a233c2567cb..2751e38760e17 100644 --- a/tests/multimodal/test_hasher.py +++ b/tests/multimodal/test_hasher.py @@ -45,10 +45,11 @@ def test_hash_collision_image_transpose(): assert hasher.hash_kwargs(image=image1) != hasher.hash_kwargs(image=image2) -def test_hash_collision_tensor_shape(): +@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16]) +def test_hash_collision_tensor_shape(dtype): # The hash should be different though the data is the same when flattened - arr1 = torch.zeros((5, 10, 20, 3)) - arr2 = torch.zeros((10, 20, 5, 3)) + arr1 = torch.zeros((5, 10, 20, 3), dtype=dtype) + arr2 = torch.zeros((10, 20, 5, 3), dtype=dtype) hasher = MultiModalHasher assert hasher.hash_kwargs(data=arr1) != 
hasher.hash_kwargs(data=arr2) diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 479961776a6a0..3708dc7065ba1 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -45,16 +45,22 @@ class MultiModalHasher: if isinstance(obj, torch.Tensor): tensor_obj: torch.Tensor = obj.cpu() tensor_dtype = tensor_obj.dtype + tensor_shape = tensor_obj.shape + + # NumPy does not support bfloat16. + # Workaround: View the tensor as a contiguous 1D array of bytes if tensor_dtype == torch.bfloat16: tensor_obj = tensor_obj.contiguous() tensor_obj = tensor_obj.view( (tensor_obj.numel(), )).view(torch.uint8) + return cls.item_to_bytes( "tensor", { "original_dtype": str(tensor_dtype), - "original_shape": tuple(tensor_obj.shape), - "data": tensor_obj.numpy() + "original_shape": tuple(tensor_shape), + "data": tensor_obj.numpy(), }) + return cls.item_to_bytes("tensor", tensor_obj.numpy()) if isinstance(obj, np.ndarray): # If the array is non-contiguous, we need to copy it first From c37c0af990ed1f3623448b82903c1ae52e84cc05 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu <riverclouds.zhu@qq.com> Date: Wed, 27 Aug 2025 03:31:20 +0800 Subject: [PATCH 047/112] [Misc] Fix comments in `tests/kernels/quantization` (#23675) Signed-off-by: zjy0516 <riverclouds.zhu@qq.com> --- tests/kernels/quantization/test_awq_triton.py | 2 +- tests/kernels/quantization/test_cutlass_2of4_sparse.py | 2 +- tests/kernels/quantization/test_cutlass_scaled_mm.py | 2 +- tests/kernels/quantization/test_cutlass_w4a8.py | 2 +- tests/kernels/quantization/test_machete_mm.py | 2 +- tests/kernels/quantization/test_marlin_gemm.py | 2 +- tests/kernels/quantization/test_triton_scaled_mm.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/kernels/quantization/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py index 96797e85bd125..9354495642b28 100644 --- a/tests/kernels/quantization/test_awq_triton.py +++ b/tests/kernels/quantization/test_awq_triton.py @@ 
-2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the AWQ Triton kernel. -Run `pytest tests/kernels/test_awq_triton.py`. +Run `pytest tests/kernels/quantization/test_awq_triton.py`. """ import pytest import torch diff --git a/tests/kernels/quantization/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py index 878f66647e19e..ae61b3b3a28a8 100644 --- a/tests/kernels/quantization/test_cutlass_2of4_sparse.py +++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for sparse cutlass kernels -Run `pytest tests/kernels/test_semi_structured.py`. +Run `pytest tests/kernels/quantization/test_cutlass_2of4_sparse.py`. """ import pytest diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index a15decdf6f827..65320509e173f 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for cutlass kernels -Run `pytest tests/kernels/test_cutlass.py`. +Run `pytest tests/kernels/quantization/test_cutlass_scaled_mm.py`. """ import random diff --git a/tests/kernels/quantization/test_cutlass_w4a8.py b/tests/kernels/quantization/test_cutlass_w4a8.py index 7832f8179d0ec..f659408efe8c6 100644 --- a/tests/kernels/quantization/test_cutlass_w4a8.py +++ b/tests/kernels/quantization/test_cutlass_w4a8.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the CUTLASS W4A8 kernel. -Run `pytest tests/kernels/test_cutlass_w4a8.py`. +Run `pytest tests/kernels/quantization/test_cutlass_w4a8.py`. 
""" from dataclasses import dataclass diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index 0e09661c955e4..50584f3f82d4c 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the machete kernel. -Run `pytest tests/kernels/test_machete_mm.py`. +Run `pytest tests/kernels/quantization/test_machete_mm.py`. """ import math diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py index ad077e0b94732..0be020085bfa4 100644 --- a/tests/kernels/quantization/test_marlin_gemm.py +++ b/tests/kernels/quantization/test_marlin_gemm.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the marlin kernel. -Run `pytest tests/kernels/marlin/test_marlin_gemm.py`. +Run `pytest tests/kernels/quantization/test_marlin_gemm.py`. """ import pytest import torch diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py index 24245663fb1d6..d8cfb5710dbad 100644 --- a/tests/kernels/quantization/test_triton_scaled_mm.py +++ b/tests/kernels/quantization/test_triton_scaled_mm.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the triton_scaled_mm kernel -Run `pytest tests/kernels/test_triton_scaled_mm.py`. +Run `pytest tests/kernels/quantization/test_triton_scaled_mm.py`. 
""" import importlib from typing import Optional From 9816b81f5f9f85391dc30ae5f48185542dfec2af Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Wed, 27 Aug 2025 03:46:52 +0800 Subject: [PATCH 048/112] [Model] Enable video support for InternVL3.5 models (#23658) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- docs/models/supported_models.md | 4 ++-- tests/models/multimodal/processing/test_common.py | 3 +++ .../models/multimodal/processing/test_tensor_schema.py | 7 ++++++- tests/models/registry.py | 5 ++++- vllm/model_executor/models/internvl.py | 10 +++++++--- 5 files changed, 22 insertions(+), 7 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 4763f2281d323..74f3a9d1cdb56 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -627,7 +627,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | | `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ | | `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | @@ -701,7 +701,7 @@ Some models are supported only via the [Transformers backend](#transformers). Th - There's no PLE caching or out-of-memory swapping support, as described in [Google's blog](https://developers.googleblog.com/en/introducing-gemma-3n/). These features might be too model-specific for vLLM, and swapping in particular may be better suited for constrained setups. !!! note - Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently. + For `InternVLChatModel`, only InternVL2.5 with Qwen2.5 text backbone (`OpenGVLab/InternVL2.5-1B` etc), InternVL3 and InternVL3.5 have video inputs support currently. !!! note To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. 
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index a604d11f0e769..74ca10d32609a 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -286,6 +286,9 @@ def _test_processing_correctness_one( "internlm/Intern-S1", "OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL3-1B", + "OpenGVLab/InternVL3_5-1B", + "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview", + "OpenGVLab/InternVL3_5-30B-A3B", "Kwai-Keye/Keye-VL-8B-Preview", "moonshotai/Kimi-VL-A3B-Instruct", "meta-llama/Llama-4-Scout-17B-16E-Instruct", diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 79164f02c3398..2d8cd49edc73b 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -38,7 +38,12 @@ ARCH_NEEDS_EXTRAS = [ "MiniCPMV", "PaliGemmaForConditionalGeneration", ] -REPO_ID_TO_SKIP = {"nm-testing/pixtral-12b-FP8-dynamic": "duplicated test"} +REPO_ID_TO_SKIP = { + "nm-testing/pixtral-12b-FP8-dynamic": "duplicated test", + # FIXME(Isotr0py): enable GPT-OSS based InternVL3.5 model + # after support PP for GPT-OSS + "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview": "Broken model", +} ImageInput = list[Image.Image] VideoInput = Union[list[Image.Image], list[np.ndarray], diff --git a/tests/models/registry.py b/tests/models/registry.py index b34c6f2e5dc84..20c7c3af67764 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -422,7 +422,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True), # noqa: E501 "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", extras={"2B": "OpenGVLab/InternVL2-2B", - "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501 + "3.0": "OpenGVLab/InternVL3-1B", # noqa: E501 + "3.5-qwen3": "OpenGVLab/InternVL3_5-1B", # noqa: E501 + "3.5-qwen3moe": "OpenGVLab/InternVL3_5-30B-A3B", # noqa: 
E501 + "3.5-gptoss": "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview"}, # noqa: E501 trust_remote_code=True), "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 trust_remote_code=True), diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index da8ad8396725d..b09ed7bbe72a3 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -855,9 +855,13 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo): def get_video_token(self) -> Optional[str]: text_model_type = self.get_hf_config().get_text_config().model_type - if text_model_type == "qwen2": - return "<|video_pad|>" - return None + video_token_map = { + "qwen2": "<|video_pad|>", + "qwen3": "<|video_pad|>", + "qwen3_moe": "<|video_pad|>", + "gpt_oss": "<|reserved_200000|>", + } + return video_token_map.get(text_model_type) def get_num_frames_with_most_features( self, From d696f86e7bdf23a6a4c212fee3522a589a460b24 Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Tue, 26 Aug 2025 13:19:05 -0700 Subject: [PATCH 049/112] [doc] Hybrid KV Cache Manager design doc (#22688) Signed-off-by: Chen Zhang <zhangch99@outlook.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../basic_grouping_example.png | Bin 0 -> 24096 bytes .../hybrid_kv_cache_manager/full_attn.png | Bin 0 -> 4120 bytes .../hybrid_kv_cache_manager/memory_layout.png | Bin 0 -> 63113 bytes .../hybrid_kv_cache_manager/overview.png | Bin 0 -> 39501 bytes .../hybrid_kv_cache_manager/sw_attn.png | Bin 0 -> 4560 bytes docs/design/hybrid_kv_cache_manager.md | 245 ++++++++++++++++++ 6 files changed, 245 insertions(+) create mode 100644 docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png create mode 100644 docs/assets/design/hybrid_kv_cache_manager/full_attn.png create mode 100644 
docs/assets/design/hybrid_kv_cache_manager/memory_layout.png create mode 100644 docs/assets/design/hybrid_kv_cache_manager/overview.png create mode 100644 docs/assets/design/hybrid_kv_cache_manager/sw_attn.png create mode 100644 docs/design/hybrid_kv_cache_manager.md diff --git a/docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png b/docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png new file mode 100644 index 0000000000000000000000000000000000000000..185f61e6a3edeefb7f3c2bc5bcb942af3408da0e GIT binary patch literal 24096 zcma&O1yog0+ckPnB&0)-kPd0-ZlqJXr5jYbLqb}*ySuxTknV2j?(To#cfWhb|9<1% z|DNH{!}jcb)?Ux^%xBKI)+SJ1Rt)Ji?rR7Hf+Qg>@(BWYAqn0$!My-~zd@zYgg~^n zBt!(2oRSYx?6ok5pSrg={%#*)er%M&lz1az`BCY^hl)292u+%32qEF=YHD6DUW_d+ zF1{06SbTg`YwL#p;q_(fBx!5yaV90j$!ek1(RhjZyC5pG01+x2G*j#6^cn~p^b2nR zQ9{$De}|Cu$ZR$?HWpi49VZKw-Q3(*Frjg<kdTmu)A;k6nqZ6+F}PxowpBQzWsGof zadp}}4z{=7zkh$-54G-Tx!4#ZI`@2?7dJFxV`D#ccS>4HG-~>~x;FNv%f5a4#u5#O zgT=|oiG_t_=Qy2iX)csk3AIQaP+5Rj*zD<4=DfDP4h{Jc94r_A!P~r&$Wy}Al{>xT zfB#<jjR)^i^5<{C2@p7g@fINZ`4x`E=KpkHEiEeqiI<mGT3Y(Mh`6+LaejVzWo2e! 
z_LHiTlG63{^@k51)HF1Lf`X6<ILXL-BO*qB{qkb+{l;W1BO_yNeONIDLPbPFL&MB$ zJDT}1+FMXiFd-p<goH#zMTG<3`j@Yjl@%W!-^Im6V`C#17nge&I-IY?bm_rzo9DzU zM|=C5>uV)tWhxpPJRF>e4hgbPpFe+gaH#e2`ZbWi{0bTxW|)$gIHRFqc47h(A`eET zq%=Y4lbATCCe0iX6BCn`CU*lnFf@dWjBIRdoXTnu>+9QjgwDpw>i%$B)7q*dCl`L| z6Bixr_HY{$5%KkSofIa#`wo1os%lYwzP!#CO!UC@l@&cby-`Pf1c-}^OM6Gh9rWqh zSy5r(3&`C3yt|tlh--CC%^g2~7Rg<I9K+<~B$yNMed}MK1zX+kV$;$XXlS@Crf3mi zm_MJUrV{AtF8B3Gg0P+w3+|ldAD^BU78O0UXg~|zKRh5HAdr)jn+&H8{u<E~EX>b` zgM-`N-i~K9__N*{1v|X8(H{rKfk7#ojd*@}85<R)_~}z7;`!Mb20A)OBHRqE=;~^g z4<o;hDt-+Q5BK$%A9Hkvy-Q*;di|zzgq5_(0u2ofA0HnC0i+dp1H$rMWN2Uj6BV`I zZU>z+yS%)7Sn+(VJDf2_TUWO%GgIHhL=1UnXGgvKvv+y<aiMZic6K(1Oj2UvEhQ~2 ztw=b%ZfkOIu*KEkqC3g{Tn$1%M`+{z?X8QEk&(K(x~8Tk6ciL59v(@=4<a~%aCRmp zbaZr$R<{O!e;9UlM-h=uq8lX@mDjId(`(hEF7kd|6?tm=^7->$(_V`4xjAYYnxecs zF=XfaYeo!=C=l6#0+1C&Wo1WR@E8~ve!jlBGD)?jqa+j*xy&X*=_^Z1OCZB8_h)T4 z2lR}L+`+))4H31S_4G1ALPBzLb8Bi?xw-FR=`?ReI6NPDAVfSo&7O0+yStC7_9v@d zrnMLp(m!KkYbz?y(9w6(4s`oU3kxH`H^;<iSLnY&K^d+z>K7+N_aPD?8J(Nc(9&`{ zn5)Um%sks1Bt?aLh+=1BGdDAXhlih?nGu(g8ct#_D=I?ZHMFsjbLIls;mOccRAez# ztiG_YFf}y=Ze1}?*2>aS1jYUSIwB=y5yY~+-6t?`2L${(=K9)NM^DdNR8#?+x9I5Y z?d_+VgLr+v>gzdVWrxJV2|}egtp?@;F(`9oQ@EV)O_%)J;SO;FMLwyh@Y!vnHo6}F z{oNZyndMpd=;Z8ta&n?qp(c5@d2le;-+$LcmPOCa!=nW<BDm{QVx4B4bzo4Cijoo$ z92f<dREL#zzeh9GQY}vRTf2#B5mC_>kWZgJjR>={vVy^@(K^r$Bt=C<H8;DNm{9r< zy?XUZjdp5y7)9`B61xp37gc5DjL}#I-O`*KR&l=Ppn%UMu}gwXMnpm?DJc<CU`x2z zo35&;h#1847Wlid!3)CDxcB!j5d#DNXCYSBnovCU%=y&RR1iQ=V*r`JEDQ||nbucT zRmI28Jjb`e&#&wL=1c((6&?KrL|jIOoG>&zd~0RpCdGNL!}IB(dvGu%B_))o$LAFS zC|Z!1XwU`4#h6%Fhi8uuQnLZb??8}0rbSImC?UeE9UX-e^#CYtXlNMS%C4-83=coL zIoo=OI-0E#6B2q6IyE;}SXe0em}YKnuG8dj5dOlS{O<N*FF84RJ7(wbFqPAG^YUOG z4cg4wdUI=w65H|q`YRe*1k^GLXgqQB+WLBWP>`R`w_I<#g$zpM<;#~hr;*}h?Ck8a zva+;T-1=vt-&^Z`{P^+n=g-Q@$|y>?)e|SBsZ?Iq{ey#We9ligI!pW#c6R3=yi!t9 zP!Q-BzrnSamOR$}gn~LS1r6)V>3T0XcRWvSetw>i&#fL*fF~a)is0~Y0JcwI@3_GY zz(KcHnGCa8FPPfcc>ew6-Q3K>G6!G)R9G61v%ZZ@oBQ2mb~f2JWJX3tfT?>^C9dEa 
z-Qk4p?(U<<0K-j8Ok7>>w3{4q^76u0H*oDil8lXyPnT#099qv-lC!hNo#OZMfFv5v zmDw6jQ&myPMU<14o|u|?)VAo|j*O2t1p@(n!OCh;EGa9CbgsCsxA*2^PnDay>9H3a zX3%N=!IqI8$_!9)JHu=|mz9NOq5i7@t)Zs6`eK7EtQ7-*@|Dd^(S9R6y{~q5Ahx1e z2}wy{Y)|wU*w}HP+3-=yZmxC(gRr6y^5NT`fNA>u`!~3y^`D1_hq<{Frv~W-1?mb4 zqx^b8LPDaVFdA|%Ahhb0WT4R{B_)ZIfxsGrQa7z{Y;;jpR(5o}GKDWKE<V4y;s<$) z75obF76-@A-#-$s_1LKyl=EzrNkF8sy80}@oKl^ZAX(`*sHlJc{{8j)HxvXF4ec3Q z0|L5Ddto3G6BE*jEb#8K(b3UgIPhKcK(3b*7msX-h=>@1PO77$^AH6K12Z=}n=#74 z!Lh%;uhZ-V1=-%&=>-^Zkvrk#<pt6-OVvLx&}^~MetYuzd<TPuX0KGcX~Z-vER2xf zLj*=cVxqUV7jTWmCdX*+Jb)XHhsgl}upkLJkN<??-CZ4l)OaLpl7;cUEG75w@R*yM zJ2*J-_xG2OkN{l|bg*bN5>ir=!NjhiA$lsRK%$;Vaw%}3oWeqC28K50vrU8UFsSFg znVgbhGL)?T=@SVJ&E4rnKbVdyhW@UuuDLlbHa0doy3m*y`@uw3P|%=zYG`P9dwYW% z+1uWRKzLlvd*<eDCyP`8B{DNJ18FRl#_8$lSxuduk@0oCCjuZx)@Dg&rjV2r4K1y- zu5OyQx1gLH0|!U^>o{veLn9+bX6C4;dytOB#l?VxJQ(2O;B0Jd+00cJ$T@xbgicDj z28Iqkh)u5*pP0zyU`|d(2EJ}6g}b|FW?oL32=$Yq;^@(MgYA~}LOoh=S5R>9l^h?q zDUgFu5Nc{_&~2FY+Mz+Uod3+rGZ7H*ijJO8>tSPLOyIPofTXv!K5ma@y+t%WTxfuJ z$t1D8<8zD0ADo`Pa^xieT@uuh=i@!+Mqm^b6&0uB;X(|Js%T4pRb}Pa@82t6!nV2J z`NhOc{`s>xQK0zsE5H#n)iNFJ78m=cYS3t+C}hknEf)ag*q^N`({9Sm$?@x0`NFZ_ zkAw#@2Lb_619zte+7~nPnW>qqw6vj#2>{30I-9hHh8xg4K<4!v^?@okGc~2vZFPIE z4q{vY8Xx~7h~fJ({+BO>)9S``0U^LfM_=9CjE#${GZ`kJr>D0&>?|&(1;GD}#o74= z>L~+YQ1t1Cx6;<u^bjIa(yXGQ{^@B|F)=Y^<%@Pd1QC*xqshxQT)>s>?d`$v0RgP2 zsJOhm1ZCw4=vi18vQZs=0umk@hziJzV5X5je<&CjY<~N^iuMNV1$-BcO7S&KmvC7^ z7)f+!6zHMcj)x+35@eAT6(=B^Kc84-ML&N0YHPdP>W&wByfaP-2*Ulnn;kn5kyk`S zMR74As}z`bv5tp_2PlYX{~K};NC1x8Yy{zT<>fy@LPQ)#^FV@P38Dg!6BZU`X>Msr z@$=j03dV->0bm7&ow)+;fQN^N+hJb-we$CHK}E%=1&+h*?fjAwUUO^<dC@P^TH4w- zH#bzmzJ7k%T3Rp?nYp>5vm8`(babE|0I`!x<9oix;9wfShl-8nXJ;euC`St$0GS01 z4S=4hvGJeY-h_yVlcs7E0v>@hhtuyOn;iit2BRE-prEj;7r+l!Hbu_o+i0Yuq`TSU z0f&dybaW+PPKe+t^*a0?L!;(vtrX?u1@N>%zYP>g<aNDV=?FlB=JdGdFk03W74-+y z(S-|4Bp_}NQE9xcDxhROeLDT+{Sp8KIwodyb#-<~xnVCl1VTut3;6bXYR}Vkcm{@} zY>6m`!v*J+_E#W%-@Qu#Uk)t@6cK<-fEBxYd%x3gDaXghao)ZKDJqK02xgU$QLqYd 
z-)B^gh`^znh4VQ-KewDI_qL>4SX><a!pK+w_(Nn?Lt`Tt)1{oXjSau^>3VD|7PKIM z)!JG%h?fWnuk$H1q}+Z_g`dBzNcCAaadB|mx9kIs49Ii(k@5>~uf}(8-n^Neodsi) zkdy@I4JI=XK+bR~@Aov;bkk(Aj^DpW+YVHPg!DT@qgc+8MG8tv2K)NH+1r0~WPBS^ zQ*!|t?C4@-Bs!qz0LKsJ>#~SJ*U-|^0%A|h5gz(QggD?PA`C3EH7E~cVw!w+0uBYB z!Jqt-3xEangYW$D0|H5(PGO<}w+|qdhhTYOAvr$Y>0(!zB{?l^d1mJAA~HPu%E}6$ z4v3AKIy$L8e~yff3aU>(I)2vI7xJ)TCyEe<cqJ#}S3ACjq!$<Si1+)~)ztwmxa)1K zr$@}d;CXj>046RhXIB@<ONbX3IRIlA$lvw#?6NX|GR}ZrRDXqm{AzF4H!%2aU8d9G z(&3Mk#^+vHQK9$wbNbwz9JBF&04f;=hf{AP`CCN)dtV5|da1d=W|<G1eC`bt2Prql zKoa{A$UYw*pSJPYv9Z;|Ll02DQ2ali{F@w)R8&FV=Dvb2D=udF@S(h_ire$iO;vRo zP$E$G<6~oBQh;n3;n>F3F*7SjOC!{HRRQpBY6=+%2{d+*wPvT2RRDckLn+LRjDf+y zd*j=@`f$)f1_o=O!;M}J1CZ?N>uYtr0t&|R;r0R)5)kB8rlv&QHkOw6*C(WnbYx_P zW@drb%Z>I@GBTl+)?Y7nCvljK--C|0yXzki0L0woRnaXT5JRBbgM0@<a;eSJ#=@e1 zXvo^iYOu4@_mM`SXC8Fk_36337#ct#kq{9<e+~%^&C1LKZ@^g}-VOj-sjshZXKxP$ z0pbc^0v@}y2tUW8=)vJ)({i(OR%t1p!JiO-vl3AhP>}e91bAp6!1FXT5&_$S2nY)s z02qKp#s?k&k*Wijq`Pa0Pf)l?>~^^(b`>u$!Dmm;hXL&dsMYpmr5)5fyY&Jb#MI18 z&)7KE&Plt<7^#8s`#x@%0|N0&*2l<xAymWuSUPKKYXEJ4TmirW(I%l3<mRWcwXm>w z2?4S`vIEq{XecNwK!Pf%t7kFXUp;+4K0B+cu0{<Msm}5dAz@`_M|l1EPghsOvIQQS z<=f6?&?SD>CB(;rq6OvoN9DZi`1lyu0zcz^0j2(dnHd2QQDi=URA53EZBdjk1Vjt~ z8vvaU^GzhW>CR3=kf&f^`Oc*1fgoKR934GD_5i>ejR!mou*~gRpf`a40u;lKE#jky zh@FE2P~FS*Utd}U-2+k+D5BA5x3gV%u)n|1=Gj(SN<W965+ARoqXV$wXn!AE%LEa| z_Tko^77GOP@pwfT1Q953z-5B588UkTf$i_^?*8@bv$b`YKbS_Kjf{+qBM0$Z1ZC6s z9>4`alTG2WH&=>2J3V~?=~0mcvDVSy10YLHU6_~G-SPYa{S5)~1#kdA1Z;3#%6&3W z?dkA=BCGRtHYoTU0tr>MwGIGb^YSRY1vtL00v@Re+Hbxo3?$wCDV&gBRE&rg@HpTj zfiZy(1HE8lZcfr|2}B{?c#Xw$Pnw`qENw(&q@;wz$?-90KHu!@Lc+u4WMyF>VCK_n zk-C5dBSHe8?jgr{ZDl1eH1y*5SitrDiBoe~Sy_5I5g}n(NC=|&JOol@JV=5b2*R%0 z=0WbLyj1^n?J30>0}T!2(a_)^If_Nk&`@l2v`C%3ogH9|L+$OW>+6O9UjXwaCid=F z0m@ZAYJ74M@Tk1}d=8(??>;_<%WZ^MSpF&#OiWCm%Gv9c2nbTo&+S0-WFe!YLn`?t zAubMPxuL#3eFbzx@QT*Pe!>LG8~Oz?Gqd8ykFOv<6B1?@7fUiSh-qk=zHRjJgSO95 zS5?*Ie#e%G50O_<@qgqmfB_BwD4Jb!-?5PqA~LehzP`DwwK;%jpuB5qYq#lvmf8?I 
znE_feucV{}@Dw}j;I2$vSva3!jT&lDVq|2{R+2)_7rSqqn<^{6fvNv)c(}D?Vrd!D z_#%vc9mR(a)N)06dHS+2<hi0nMMZfCj{g1?*zraOKaoTL>=|kMqN=Lu>e}@4r@XoO zE>M4fZ3O!pe0U23;}6xwCkAud0*JrFp#(rBR4lC6*jUjfp1bt?{Fyd%ENpCWa^05| zp}t?BjhvmF94t0rFNXg3A<*<q>{azi;C&1t1n2|-8WYe)qqXNi2ixWZtVUrIh}^N+ z*#@h5HoGe@*MOlxyll6ICa0&Pk`lqd03LazjCYXG)0bvv8v<<HK0I0LuBomD(gc9( z2m_?r1lbA}+!mkf<vuE}%by^AHr=pS0BnKB@^KFp&?|XW)v}V3kB{t~yMT1Ub^U_8 z@J=r&u>!>+3c3-H4ETw|qgi6W<pA(9(pvZT?`IbL611(2jSDK(j~_PxSZ-eipb-9g zTIg(QY#iagygFJ&gC3oprsQTM#SCo&5D*m`8wQk$g$1|!Eeiy4;M!!rhXo5Orl$7R zhX`msK=|1|e0W<Ge1!n)E1*9Bo&l2u`i0(Xr4a=M#rN*KeOGV*m=(cTbV6b?Kw)Bn zNV~@xP$yL$AqoDZQ7wgg{W{a#11KKQuuY##34spF1umAZZa5KeHxXfBVQ-)WaI#wK z06zT)42+45C4q}fNui^q4Y?ErtU3u;F<>Bmn24#$Pp$0E&dz;XwwOkWLbF$~F)>{| zJx)hUymr&Gvk56FEkJC`p_4@9=jWSRSj>-)e<$hzL<{&-0CoUz`SGKt(SC2_(X1LP zK0cl!s1G=%0OBS`N4<%9Qn(#wW@r2R`>9CpD~=APN;Dtt>ey_Sic3mDyVua5LFxZm z`69eyOwb63zKL8CTS-F$j^Ix~zk$nkzH#|*=V)xa86W=^T5xr3O<hfGL{*WHUZ?pz zs9L@yevf-_x@k2qjesZ$@b@Qy`=qJKZPXVHq|IZCrJS4`GIE%Y&kJZlP{Y710;C{_ zNL*X{9Uyr?BcMV9Uqh$XG9fH%i0G9<$x$fVCLIqC{za0cloYV!(pW99@bHoV%7FF? 
zCVAuuW~08Yvr`06Z&(xpBDmAlE(CzVz;;wnP~hWh1^GY<$MoWCYI4#@U%$PtkIQO~ zMNe-fODr5{tgKSlHf9@bZEfH-2*)s}l?$k<V!xx1CPS~Yn)mP0&&kOFGJbMmVuaG` zb;y3{#`=0sUmwtrL4koX$s8x)=6Ch6on2ir@gD>Q1Oxy8M@5~1X;b>S*xif<=S!zq z`}Ye6h$bjZU|b-@7v|;~&sCd&ECEDvU|;~S`t7waM$t{+QUN{~6dDRz*VGi&yLpz+ zHsgQQFBome(BT6*kT_oN9vLZ)b=Rx|Z2($7$VSt;y*LJ480naVgo6D1p5b8zK={eX zL>{I5{rm#K>&Fk^4W1nzBVN;;viE_QU!0#iIXO|tCie^uK7gX;Lwht`0$nAmtZbRG z99T`l!r#NgZGfKxD3>tKIUDUPKzpE+*Z%&MmzV#nr>9<L{qFWs7<GN7!hi!`RzU$R zpd%t>^A>POM%ztkAk~0y0)JO-VMd5AR2aZAGgkpF3H+6U0yAyx#pQkq^gsZgpzVOB z4k(dW7(OxK?c+le$Y$XCrl+R^)a7D3bPTaQ^CuEFFfaf;;a1(3NF20sadB~B;g?3x zlGEKmL9b7zNeIHjfp-OHG5-`QG;xG@bOX3W;K@W1Vln}uwjF?;u)e+yP-EgEzp(Ii zi3qSAe~PV<!hHg33t@ZG|GEV@4;T;s^j0LLkpOBoVEzEUIXRuzv5l@nu1i56K_P+H zqiO;FLzyN>Vh}(lCypB|#9*|ZQUGB`$mGDh1z88k)Au|X2?_YuuSrmHbF*7oT0oip z!1bpAT$TI%`~3R)c>GV`^qKJA%mqb7MWv+_N15Tb<@9$w1d)860?>khLxE)s;P%QZ z^1=b_3Ydy}roDp$fMq{FzYNc`3i(vPrEqXKKR=3)fd26XbpvR`mwd!z=v2hS#C&`n z+tY6KmCVe{pl9B?x@3hrrkR_10<R3P$&t9X<iOzsLlwR<C&=7yZf-t4{?g~jD(UFx zXlG{!JT-ZI7b@LpLr@=g%iO1Xj?jH(#!K61ii+bP_p%(j`uh=ryF|sr##PW$mRR2> zC5{wW`dY^b&nj&azI!X6tJUV=-WBu)+$8Wr!0TC2a(^QRn0i_UH8}qP!2WPgPkwRn z{_bvMY;5jOvC_ZwBB+sOq@=clmq=g{jA+99(pFaJ-#Xat|9N5Q+(i~-)e8v7a;m&* z-}|-n;H2?I?eCQZ@F{&Tc18GoxlH+|P(&!lxpUOdBYEK+LjQeGqtbkm8tB8aCC3dz za<lhnOtjxlCF!uNv;Jdw4J>nYxDMhq4&~M7|AnSv+}-iHe|K<@;)Q&K8!aQVt;CkD zxgpOMBN%f9+Zg-2qGyQzVi50nIqwhgNVDMo;Q%R%O^(sz!Mp3nl^A9FK{~6BLC{w+ zBZG#94h2^yCfO&aST9j*EX@lFO9@rJ8f~!t&NyMMLF!QCyo^?zK=eS%msXH?6(OEk z6BIErGBf5)IHP)<mQ&DBlU*D%MeQ#G+gg=b89PNSDh}Q?Zl<}Sgs28d(KI#Y-+JI> zIFEl&J2^Cve>%Ql3v)blMPjczJS4!0xzUS|_}=-I>zl#+Qt(3u!?IiJ!sEzD|LG$> zkDyU$SLT5uVJkuDPz5O`tC)^Og>O5)HT+u29etEI3mY}xa{EWaK_jKjGs(n-FDL0G z@hhK}M~baXu~9LFEr^A4ikpo$?@O8^bH^lYKT>Kswcbctd=&N(Cp+MD3RN9b(pvf? 
z*}I!HjG{q<O91CXTd~;o?Ey^#0f+i)APy7t8I`BZU`aI=E7@UF3r7401D{hK9jfs& z)x`o8`o-d@7QI2C7v9%bH%={QX?n$5SC4Ek-*4^~ME!tw4sNs`Ynbb(nU}K635;Uw zQ>Lrq%&4TCy7S<e@|}&2-CVtc!sjmo3r0<E)+q*3IYU)k{67jR++@bcek6ZWjm3nh zQp_Fl`*Y!GEBx^pxIwj=llaKykj6s%68i`5`72$%vwM8JdubV?(}TOR*b>h8H&6ay zqR`9k*K-U+?E`9YQ$qfdmToXu{m0uwjNjm3x$%%L5F>^?GMZ3yPF!5{QgcR4+D1Oo zq_9kG6rAzh6t&jvFRaGdv~H}&+|vt5Y8)~TGcca)xEGRwA=&V|U7-XW?hv2LDXS~1 z>L`p;$_rZiRIZAgT7NSi!aMr)Tk~U~1@4mj{o^b{bxzGYq=Nk75*t%2%)fN>CHuO< z=4Mt;r>q0~?;D$%@|rVYtlUdV%}mX#<YWhLE_q+qlmQ6${>`X!bbp4>-(zNBj)#k@ z+0*nUrp7b_grdei6y)ww+V}6mYp~TL;LRRhq8#C3d(RK$Px0IJi<e7T?BA28)y|H0 zTxSC#@uultKn6aHYF5~>SzRE$3!nC&R*W^Xc+bQ*I_RjW?XD<d*H=J_+%Dv%M%%IU z&LMJ@ME7NuY02L31Un1ioJCnK6Aq-@eVhpq##fAdTw_AMcYvP9er4sJQ}j7Nk)LdH zcMB5Unn!ttgs5o*-!iLO^Dgg{yQSu!K|UW^c1YzpGBSSJIpRFkKjd|(dYT!OU_!5e z$7!l7t#>k~+!#f6+d`jzh+w2j+&xX0^#cP)(_KczT73!OAZXm&y;M<^YRIkzeS721 zM3ulA`CGe5sxH1m+L;|UB|U{mLdJLm-n(nrvOH`!=+z+mfQNh18rC<p+}hERsCotl zI}ArdFB$Sk*;$k}j#IR|$YX{HQxi?h4BIvg)eO<G`cVmDlJR$XyJoo=9ZW@5-SEWF z^gPZ7XGUheZ44Z4NDWSNaR8vdbwi*;{XyJ{QqRGOMeAs&uaEf#?T1XyjAh&PCOIYK z*R53^rFh^{YbD{}p#JD(P3_s`VR_THy@sc&?ZRCba=*aYhL4t(w2y>ZA|IjO=lr2C z%%|?JEdP+z<TBTH7q(<+YPqtufqSxegSU##bY30P+tjEbnMco;9T<sryfb-$L*YVD z_Fzh9m=AY+hMMFuub?^4wY!^ieN%7t-t0#*U!!01JKTtwMHzX;RB2&eVX%dv+o5BW zxx3bI3yU&mb3|iEUP7`rm3#hpCF%;~)LdeatqI<k#%S8k4zb9e>{Px+a`rYd60VP0 zN~Tv+Z<H)J@thBkmS+>*JhAQ1uh}d)mMuM&$qmn*FqeYc;S4<TX<ZnWy8QF`S1j)q z6eP{fv%aw<^zo6U^GZ<V9*;J5WgtgvU5TODt&UU2?P~S(1<l%W%-!{Dz!7mpZ$#>E zJ{0}$Y1lDJm*G+I$t||a8*ztQnXUX1VaE$~78kGYNC61zwj9L|kJQyv&`RHlo6Y|a z87mC!o}L@(4{?n`zZo;G;552mWFeuVd9<?9^YU)}Q^^V|gIHsWzU=4Wj*0X4al7CO zl<{}n{2AYnyE^2%)}|POv&7)2wZDP$?j#cvY;f?Wk&dsfogt8yx1;g_yNNJ@-a5QK zFPd2}nQ2uthEw%2#Vp=<c6qPWad4=`(I*Q6bS3Ab8aKTZt!hA?X5c8K?jO?_`pMxj zr7SJ-CF(&RfGCUz2^Agb)9OVY)Ne+r9cQWi%Xupfe<HYN^y=Gf1Rz;Q*t(Zg(sH#u zc)#IFi)Edc_`SDha6W9XzoCHx0{Jsub8fHP!VjxR;C{DqM{(CB<l_c`AUo8$H>r4D zA{_j3Y<<GCWHg|@W&gDj&)?R%;^xGh*m)MZoH$6iaFp`4S$*ox_BiqtGleWFEJR!q 
zZl1nspKxn2@2D>b1AB;7(DCX>C-nE{RyU+O?Nr7Sv#y+&0I*)vZDL|TPRN>;!7m{y z<dsIr%FDpswq8EY1p@;E@yczyZ*QOgC<WUluV|+<!mMx9Wi@ZOxFrMy`5}T%`qAoL zaMue+W0Nb)u){tPF<GA|il`_pXrGGMlDDhBPBe#kTAMu_&>l=iUnS%3Cr#5Wl^d^Q zU^w~+ubI(p4g30%^EjUbR>B6;n_3*rG>&qe?`5gV4YfTSMIOy-f;w<}AO$dm0tnR$ zh=&N-I=QQuyJsZoEeRX(+U0}l1eN>xri`SJZ!;Y|#Y8uETV<oHhGcXz4ITO1xNv@v z4Ru+5M7pUxKndcy4yP7`cn-nU!GK@C`%Cf*-(bHXJCgm0^^Qo^sHED^h64j5D?a}h z(Hd-DpT%38ihM@VwXB2u_w7#zWyPg0+fwVRqfL#J7ytSgWxI0EcYXBJ(3qc}<9<*2 z<!eP<-Bnnnz}-Cur`eR*K-W!hYV9R5n+g^!7IFl>Fc)`|YBBdcC<q(EL)o-KK?}|f z0a=wkhlHHH(2m&6@vu~fjGAPAuWrYZSvUQhi~Zu{%)cYoW2d{hxl?~foOoJ;r^X&3 z?AYBK%uS$+jcPL+EcQl`!CtXFVGHcRbF<AKpMHUYV6IcpmNVrhs-E>Mt4FWR@8Tui zonw8m{mR0^q9_W1eEC{w`>m3d^@>7znD$4QlyDv40^jP2!)B=CuX&VZSgN@2xZttC zteToEPkaLFA(x;r1*qtf2zS)KTg0e8*2&1w3rbr9-7rgvGALg`Oik}Mn(G@lArN3b zi?0*`Q`mywR_ncr3ck@KD&&K6q>`Y*k$KfCzYWV`T6F#PHbw5K^U)|>c?x#*AQk;b z{*kFAz1A<9T0+JmUU$`0=GC#xN(B6Os7XHsmz0$w>1dAj&@RhH=cdAbe_rx@UAz2k z%%i`z3@QR=_SL1PtjZ*!%|7i#F9WTu>tjSE+ABc_LRf--q`yymJ`vs?Ln|5wKJ#R2 zjF`?qQ#gO9ypEumn&B926;-sD$=f}b=1bD$kUWAk)`@kedQKTc`}^;i8uK1=*&UQe zS}hNRjdVT3MOg*aCP@diE!gi%qzH|NlP_1K;548}P=6fpA$>*|46$EBXJ)EG7+sWW z<p+H{B+ufBe%MTL!q@NGx}jCXk&qv(G1O*@D9WV91?=5O5!%&N6l`S4{iyK9f_Mc- z@v=YV!J3Jx2!1uQ3I7{@EK(V;g}|U2jH7sWUaT>c7^|y>l@=X%2u7R4-u=U$Pr-Ho z$;YEJqhra{1<r;Kn!cJTBPRX{WyXp_MR6V#?JotTlw2CxQ_Cr*!!1Fh3=t~-?fuBO z0s#;FAR78Q4Zzn&E48XGbC?(H)#1@Hv`-XtbO(ENDW^A=RRQEIB#dS06JrZrwS|vx zSpA<_U2hQ>_^Xr3TV6h~5{*rW2XN(tFW54Xa2Zt`w1P%Itm~ov*=V6VbSwF+c4re0 z(YT~4nU#*3jgEqE@aW+k<d^>>LYk|`D7$8pp|`rEhPA24r}`&@K3~dkOOGci)28tp z&=6o~9Rt3HPIG(mMzNZb*JWhnc-B~kHT(v!7DC5fQj-$U?byis<h1%%^i7UL8#gw7 z*v-k}nw09n)f2&BUr<qDsfoV+{?TGbUr>|l!7mNit&rjuR1#=qj!<KfnRgClFc4Zz z9*dQrmS7bwi68LpH@LY(?B2<*gN;7K*LcjXCVUM{z)^oi-0I$f3dpMEZ25%wPN{tn z7U--2zFh!%PPq$|KCftDXj>U&&Ei?b#={2uxjIL6u+kP3=3Pt5pP4|>6>@T}KbM~R z`LfPNN3_E`TzLI7LEBgbSw+gx%!1nE^kH;l<n*$OTq78_&a|-Krc&3k>2dpZ5&<6C zXZ|c4_Of_%@>6C_DZlN`x$JlKJ(GcXCilWWP2tNo*Y&m5)w!~xK<GG@62!B89319n 
zqAa_<^=n}ZHpy6_M79!X%B2tw%n@yITiH;RS2Jc*d<raLo=fU#Z^ysA7)4-M&W=t7 z*?FYHO9Yq|NknQr7Y$Vp1Tv$h&ez<G6&F9c<eg{PNs6*(Y+wX`cX?gVy>;Ktl$YW_ zEm@hf;hx9T{`o_ywF$^6xNynMjg8yKF8iRc*Dj`c9p5Su_{deT#?K;~zYDFYeW~Zf zf>~JogO6_s@oM4TudjC*+zQIUDwM1wbQc2Hl!9VYsl$QsPW*#sn8Nn77-6d~Kqy!b z2RiCfqO0|S0%^XZ=0(uj@NK_?40)e|?pn4L^N{P$wW1)AXnv|Xe2{xEqXu>X-Ca(B z=HFYuiBKE@NW<F7W_Cs@+VSzFf()y(M$udJ7xz4JS}AwXV}1cGVY#0)LZ)ry(pJa% zkczP9=G4X(Z7bah{B984!Dp$jYzTBd8{y;Q4)+C}xjXjv2n0_B`FBE7Ra9@Q4(r}x z#wi`zsae;#$);`gRlF2-pp%i4Q&muq2)@k3=Y4qc1=xR4K$FVP6ME_9h7?k*)m-)d z0V4;yw}-=JZbQsz6{W2LmpwLBJ)Ox2XL0fW68{2qeHoV2XNt}miIt}egKKuo4IT0A zzlV$s(^+k1t!aXHc%j=V)8w~=lOC69pT)4Rion?=#+57Oi|+>dtK*y;Gss8U>>4kX z;s(<6(#sgB)&EK6ZGp3?engBGg|sst9@$s_d;#dNNk1U4dpMvI+5hQy;e4wlak5W_ z(yyNpBTHWqBeKeaJxvd`72_RYDE-z$Qy>R0pu~8z!TBK}5)$G=w~Mbc0V}sdGC~hj zQu=h#R6qp!;Vt6FVVd`T2vEXcgeB><&P`X_q0>!Z18{_RBHOsfE}|=GaA+Gd9h4yB zsfH8;QeyQEsL-{j%G>S+yEUA&>VoW5d@6F9r}S;TK+ts*rKhEn8iqQ*jUVMMT|QuC z6gMG4MkW?0vY6VQPH~*v;vQn7+&&rmFfqs2m3aD6rmrA{(oGeijJPdYfqT1of>$2Y z1f6@9##-#M<RxT%Gp>`u(D$yXM%gdS>7+NDk6%{T4Rim@YX-B*plWb6=I)W>HzT<^ z=STQn^-)1NO=^!2CtQ}iEOxht4h5vJd8?AxLs1J?!0Cm2no-i6!f7?1C-ELhiPfe~ zOIJf(8vF0o&M(Kzs7oehW;T}E>grm>(x%0c86R0Ej%dV#Sj}BC8s-ll_UCIfq?NpQ zh}Q|tj($B&{OX*T0oXexYK99$UqEFKPN0a(;myYOBhq+oKI4ZEmTFb<x^f$bFB#6a zYeW0#bv$3hMFOJ-byt}8Oq48=puSDB@)sfm6(6_lYc~$$m;SsvJ&NwKqpL+jwqwPS zfyrTp*pqmgea=O-GRPVK{4G~;+Q#->pN+86@a0AxH{NL%_M7dMD93jWX-@_e2k%lq zfT9e^dWeB+gYV_XQwYtosHv{Kdv(8O!2}Yl=lZ+LCLJ+nQAWi_In7s)58VrYtEQ14 z0+z>TS$EUnv>m0Thnul%eN^~>eVgwk`o{z17*V&S3=t5T2^Mxe5X<%co}w)8#ZJP7 zvq&BxUID*@gv39+2UfX9##M6XRz;t^CSu_hy+xoNn|ZX{nzgcGh#OmRH`*E1XkW%) zH~jhh0fa>^xhcF$UpdB*clnx?weB_iIlUh>C5?<u9PD$}D}jNQR229{8L+6P8yo!w zo?vaAW2EpL_wfUEw;o!uf7pAlVzOTvj{6bnJ?y*bZ(l3<FRu6R=-e(|(@?=jOoZ^9 zD#*&R^=-&K+h0QO@=cZibLcIKzYGv3i>#aXAPhxEgdycts7%if!H@h$bQ<NVfu<5C zgNFS2HDUu*$A{~oZtW_L3szCyT~E@K^w7K=RIgP0SmD^(2UgV^HOc=OUQiQzu2Hd= zR_504M%usHevOWq{(GVgwpz8)kkRV>a2{f0cUpV*qmd#uX4>~7`Gv0PT#rD1r~G`U 
zyzB8sM12E?WPjm^i;gSoqgbar>wr%6+OL!OZ3klqJ%KMF7nup`Sw?Rrfng&&@cMaj zvR+*b%q#IKJFrb{&p`3u(V5^7!RfQOt}S&;*M#nm_{n*8yVp6srBBJ-RDt|@Tr8p~ z+8e;O=JuGHG#sMnSBGJi5#yk&oZR18T!dEo9yUs?pyw?}vh4iVZ)FT*_e%&eSE>8? zglo~7-*-yuq=DJ}a%=cTQ+!DgyPyV~#o{<PFpM>rT~^uEF9QI``EMQt=zdm5Z)2#M zSkogzys~}22+bf2PwQ%G%o8qh{^=N0P}W9)j3`BUI^Dg8c*VTB>YQ{wC6j=71!i1= zv+P<_i3M9bKZFC47aT+rwy#-d-Y{8C)+-*&`_+E)v&k!nxKt2@@59yUU>!hh0Hqu= zvm;+^xHN)F*-@Y%95d=^^;~kvM97eMvrKSxobCZcuo@pvfV2JhhZ|K8Rj#he_)p{d zJz#;rayA)GWTVDLeV$L@fI@^-d^;@*^O~F0f)rZX0g$y8DnsOiiDaaLoI(N;dL$5m zPeM+Ur+`3HP>6mryy+b~#e8RZ@3sJPSu9t~D=pwt$GXI)Ze+`C3^9#QN*bRqA-@I& zM8b-R*aZcxUvP%SHu}Ui3O(Hu@?I>}n$2!=P5iS>-^<%ip#z#Eg%!W(d>oYMHx>k% zomYDNLzC*b#y_m`^qo_r*5#{8@_Qw=A!H_`r_gm4kT)`=`uxB0IOb(jgT23Nfh$Cf zEr}tGSYt*91#au{=|vp#e{_y?=kNb}EBWsO+0(X_H8pP_@;d5ZVPA-w_vea&jyl5j zajD?!gs-g8%SL5nBSjpg8wk;p0a!emh=B$ZhuX6Zwsrg8OWGV*f#qJX4ic`r9GAf1 z?eLnoBda{!wB^FMvB^T7jWt7Y*6W{R4x_xp-#p#?_1m`>5Z{l!o#b8lc?hS^Mm&V8 z`9D8NxUKOVpZ{M^Sp4TR7(H-<^Rdd@?LMTp0ckUiyK70q9fP4G!fG|9+AiPz5Pc<B zaB<r@ZFyYL8D382b@{q&6oVC4l3e#kdVY1qs%~W$-vW-;cM&fo2tmq}E<5gtz_oG7 zN!;R12L(H~P09rYARzEkbSg@!QzIH(<qhj@7vq|Q;2@jevQXFouY|=LVG>^Qvyq`J zcMovLw8TQ(H$cQ%v&v#J(>@O+rKsd_5L(0F7u=wxo3y^WmHX3i^?JzZSkTI*sE|10 zO2yzH9jW{$l~1WLv8?CUDAv5CMJ4>sH#7S;o22ip_qPuZ9p+p{t~#c}F1-WZkNK9R zJ;Y36nw99!_(J>2CesT48!Js&PJ^<V(huOhh7QB>v(mkJ9^-NwOh1)uF0jw#byRK+ zHv$ez=`|2;m{*7+@Zz|oBa=+)JU%*zh8^73q@*ZC4qk7)e}YMJC?$+>1N|=V?KW<r z3}MWIXs~a5XB6yEZEf|KMh`=EU5!bu@viK6PR&m}j9Xr}#aPyA6-G-BHxy=;6w&{h zyYPerbq&!VFdi8-`D=eG3ov5w_eE5xZT2vWV5drLMRirJrat4*@~tT0&nt!&q`_Qn zg4tJ)k)&K^<x!4PvFL&Z#y3F(&dZt_-~RBdb)5<op2@^^`gUK2A5X14NNY<u6DteX z;D9L;MTJHRhY#20>y}jz(U$YLy|AqVPZ}5%MrMw^%9YYCT{?=I|F}g%iIV)W&b|3z z;9{!}p`cW|GNUllMZmdX*4ZM?QXXqsDQ;}G!NW#d<{jkoL*<~J;EL|jU80$0bxp0p z9~(3C*($TNKhV4-fw$}oOX^LA9am+)Azt6P!<L#JKM$S1nv2S%pv?ZueGCFW9+5)L zklpAMD`RNNl6Zf7xK>BMNWN9;F>745WdnhPYp3EnayftLRxtW1<Nhiu3H#`rQx_Bb zU#JN#pX-xNO`K>zSH4A2Rrz1wKRVe}J7$yT?BR@LC?g}@P4hMT3z>etwo`oh9BAj+ 
z8#P)LmrRxtKg6Vfx`o}nU6fS2%$lFi+wDdd&Z~@&m%U*DP<}G|=o67t2j=?<e-14I zjcHQcGwPt0&v<iOGYy3JkgJ@Ri~Ib15(DL^ezEBh0+EkbtzP=`yK{kqo?$g?-X7&D zj$uJ3Qt`QtbgE?)RSkj-mcyd{+~)e<!axMv?0m*!Us|xw`{lt7Yx8WKH8DKYu(Gn1 z=+w(9X_0-HI&%NJx0$<HBgKvm-+5LiYautvlw@_fFrN~d63Wz@I(R1)B&mCIe|0(E zU!U*WGh%u!J25%cq~rO)CU7#fgL?y$f^fl^g^6Wua8J<AA-|yE<j)(;F-0aQ2$DV0 zVCdlbLSd2loW?v9q-3z9?8&ve?+<R&JBRs)#=NGur2eQ!pW#KG4JX6tHe+#+vw7Wp zB1&o@jj_~|k3w0mT3@CmM)Mgp5<E}k$dR7E@wv%TyU+F*^|aC?PyI#DNe?};EV70L z4mzG=sg1SyQp3>6(1z<?tW|1VXOKC{4e%0gtuBqLWNuzvBd6M-qhsx_DV(l9GAB(+ zZPC@Nu52(S7*)HMeQ(-hMh`r{ZL5<ke2v`1UTIsthqIm9;1S8=ZleBmcK72R-DU!} zE?3a+&mO9W{dr{|GfJv;>4|#UCLB(@CeKiG(%!y(E30ddg@%#Jp}x7>g7B*06|r}Q z5Qz|NF)jZ7{%WK?B4W#yj+Q(=7N*DHA1Y^ZPMlfz=_B2Ceb;dF9W(3Ms$T1dpxi&3 zWoh?vS{#|VIei`dRR+r{GVd<7P;?PJGDNyQhNbX#4*f3WR(2g9x=594zU|JyJUc?g z!jz4R2|Ti+;-;n*l~MG2EM{tDCTEmMqc8bZkM>SuBk$@7105}~Zsr*XB*nyFDJf0# zJQS=QbX+xIVJTrF1hRpA_ADyQr^izq6n&d0!=c53P&>mB6J<g6;AS&~zbQ`<<- zUnijO(Hq~I(J?l`55Cd;cH|~;7xydt=<oi)VLEyF*rOOhg+7L_N|<jGJT7Tp!@>WM z`0-{P>z;l%xtNDG)_b9smWYgqmRg99nv_O(2utv%?vRue-P6_Vqg=-8Sr@88>gbqg z#S)VXoA-ST6zCFB>l6U&%Ha`Sg9L$2(b(87f2)jSq{5~GTxNol6m5wUs#|}>=h8%l z?_?)9vYJvez^SO7Vp4LLIZF5-acgiqpA=r&igNz?_~G&IrNJ&H@3*tdwo<TP2S_pL z0qJNl#<z$OGxNQ^i56gpK>h!O;!1-+ir9*ITn<NSs@P$#Lb~-UMD)u}OnxDLc@1nm zk^7C8-r{8X9=gKq`W0m8VEtPd8Eh}1u!$NZGtUvvUSg(1hev!WGQDgddWj{ny#rYL z=gzU;SvPC0@fkP|HPTqvm>6)->NgT#G0C#srya>;BuyC^;dPp903inGNPWAK?k?{i z-80(_5|a{3ur$Ct$eZQc&<@aORO%DY5m6HL@Q;ttx)AAxR{XWmk{TNg1&F@xJHdCj z_PX!%tHR0p;!F<*pNEczpocSg!$hauw`)I)IQDsK;kvve99mtglFSS@r|Al)2L}0Q z<u@1xQ9Vsmfdvn1@AKbh=NSGyN!hkmmvA<6j!E{}KRQ(!|J22#PwIJpj?p{RBdD!# zOO(ff6Kr&S(Y$K(#t;eup17o=Ygx>TF^XB|Fq1GdluAR5%1)=1tBreN%2?hR8(aZy z)xJCxJ?--&Mzl}!V&`e(EbkY7Ag4uvB0ryD!ofqMq+yp((pn!@ctV<)nFTvdX=n<2 zh+{Tl78bPrdE0{zOXG4jt%ljghjV{B2L^aNceMJlK*PMcyuF^;kdRQ=(G0)3vf-Os zt;hNPn^w1~xW0V7J+S{(gWr-4p)TJ^ru2{BpNF)#6B6Semr28yW16P)?9KJ(!0e1i zF&{3wtsh~`@DDY2*@CB39BgKiA`%l&qv~y2vixi3Sk{)OK0-s^sM=I7)*bvz>K_Mm 
z<(X>y8u|bw_}E2*mBRCdVp~((L-u#?OOrF(*YdD-Z;&<wmeawg0WCSyIucipqhl!H zC|~;m_z{4V#Uoz(nHIf~y<54F$;p_vNh~o-iPn(CMo!<cJY{-1`VPmxJ72fe-n(DY z<mDN@AD!e-!opg3?V8k|SL%6({xdGc6A&!I<qj2_$}HI((qd85Ju?mZa;Jln(&Bn) z3FVVEl*4?y@SsNYU21)E>;2R7sqdQMx;&SdnCKkt8=cKLDi8=f)ydY<A^XtsT={}` zaME69PL@hAEj%wY?@R58Y`J(c_wDo6bQ|JeRoZMjq(lz&+@>P%c*6O^nuUpt`yGCI zW~tluL~Eu`YiI&~GCo0Cst&)SMZg0Ag)xPq?b?B?ro0UrXIJk6&r3J%eeT2e<fL!T zS>wS|3BYTzz_R)LhY=VNG3%XVE^RTgI$RD<n^{G9Hh{4L&eC#vgL05kxm?*;TKhPa z(=@<g+=N7<JQ35YtA;yg{=(;;dz$<7^h~GObx8lPcv)K+?6Bi~x#*ekn~apzB5_p7 z<?A4cFc~UMA9Wt??l^#z-rfOXN1<`F-V-WYtRcYA^b|F<`~M74l#z*95O;`*NQ=ls z4fzifesMs@M6Y-6&1oso){<N}m~?;VPPdC0N{k{V_0Q$H!M{x<&=h|}cY}QdZK#NS zHUmRcDOx$+T)i3L`1TH-+d8@J9Ujir>e~3|PfiFRJGMyJqyJ+Y33!abXhW9=X(t`y z$LsRS3YeP!!(5{hd$q;!@imvVjNGus$intC#pm5C>uJ!5b!j{v_RAN4zH`SNt*FI^ z8cXZ8*&h%Y9QlW37QR@#kp_NK4+jK%?nh8?a<iM;MAf_Oynnl347%Gm)M1`gHIXO$ ze^Pq5gTeWSq;S6GM;2;zfZ&JoE~bD=58cU5D}_!*c6a%a2ROddtTA0w5Zm%u5Mn9M zX#JXUk(+v$`g{C&aK2n4GtC>uTP+vosa2mbMg6CgCTHl>z~q`Al{Xevx@<JFZU?bt zNvTWRd<uROv!Psj+3$SgWVtzGoMZA5a((T*5woFX!-s!?{_H9+y)d<0vevb$sHxjw zes|X6w0f(hC3(NTv1W`#QMXu%0p|<Q|A}v&uYTE;nyBY#J7*7&ch~Skooct0NoMvD z;Y;MAw0gcgt&4xRU+ojSS<+lh^6RK+a+P!2x-&fgBXcZOpqZ)61b_3MnyaR(ko2mw zcK>Cq14}y?BDSoSXx;Z#KiVWKsRT{6FVN1Ff)vKNW}>%f{F%4d8d~zr=A&Gp@Ggbu zb9kzjR8-VfR%S^B771_TLPR;5@2=;_`SbjW-xslr*QO`ya8lC9dER24nq$tn;Ew85 zY=`s$zF?k6U6$I06Ke7DS&X%TG1ATO8?KV!KnZ$4AQ=JW79Bc8U9pU_dZ4&O{r><e zr1nv`A)}R%ixvj*Gbtr5Hu~|v7Lqu%OertB6BnpO(@)dY={g;Rfref1zGPc>n=wx@ zQ=i29tH*L?o1Nno#}W5nZx``<+G|oGOo#v(YMc9IIC8jEz6~_Q7ux^Zx38d$|BSNz z@=gA7cXDb1Jl+n7WJZ%>?w<P-E)O;KKNHFGCEcK=x&CeEQ%Nk?0=>BOyf*cPBiX02 z`EC!^VNVros+VJDg+mAfSB3rfHW5cDRy8IjniLKl9hGZ*EQF~})(tx>$1%qM9~taL zooyTOZ-jwB5D)-gT@UKNS;G#4K#T|2;1R&TWL=bJrzf)UWTA5te{G1Fx9&d$HB5%2 z+x!Mf;RWOylWp3PCo+WS{9bF(s?x_&;~SSy!|A^*Q>IX0pxRcv420~BgMWr<2S-=> z3icYm!6aGLk#|y@oSwXR-GPCHZ)VM^s3SqoMN2_P)sod>P&mraUsGPb`hJ@9VCN8Q zHHU!|S1nbWk2Wd?Pt7tc!a-)7KhM`OK!Hm+AWV5Uoh5{)VM9I}>m{ZpFHOu~V&OAR 
zng)fnI@qa6o0zC7nVyHt=0@kxR;rg9pB1`mq5v&qt#<gi>JKhNAk`r_JgqLTDKDqM zXrL75xk&w?=?3vgNn-`JwG~~HqJ+gGpCXuH!4#jhjyJJ$f$nRJ3nmoQ>YtB)WLU@2 zjI#;fU}4}NcZ^taAqQj)$fzmR*EYsBYicUe&{6|4Gzv57=kp(oD9o3Tc+EjXIfTa5 zZ-SDdyDM~c<eU9XO-+=tu@T8cI5(tRBplxgGai*Xs1#Nh6*5NeeeNX--La84{_bPO zyt(9k+`FnT18KF;yr8v^yiI&=w{4o@1WXg0vXQZ7_}2)TGJX&*Ah=tQVL3QBiw|NK zzBM@<M=Ne$eSkpN8R{-EOPW-*U-?3lWq0P`$^;^YBH!Zj>`li4+v*Z?aHc11&lDsJ z1{!;p05;^yX5!h6rpeGUm+j@Pgb>O~ewM9*U~jTt)$k(6#mIP0c7AnzIW*)$NBhLl zc-{HL0M+=#IZ_@bcC=G{Tk>dGv?BxmbLMvqQqfcW>FL})!Z7{J6XRB$QJNYP8gm#c z0o<FPC$MybNx7w>E|M^fgOtCp-(cA~U7eh-hX5rmXXAd3Xg4a%Oe^iy`~a>Ef!O8l zvaqrlnQY<?vp_?DMlRK;MlPwIwNk@*gXIo-84$v@Vhn7oEOb?xV|gN?6o1B6D>3e5 zyL`I==(T-ur6QrAr6UDyagx>Tfvs2Z0vVuh;18g0KfUB9eoReHlz?j<ghs_AZC@-W zZH4nl21x>CjdF|IY&1;q?a{0kN23wfz-<;_F!ZU3so7G93GUQ1E9T}KD2ST6IzijF zgHFdowYsgpVOfACsvc>JWDS<Hj3*RYF*;J<TJ7|8u@x0J+?Leys_R^4Y_))X4|I7? zZ=bE}gI7{qlV^siHiK+znw{ZrU`dhV>Pd>+GM{MEHj6!LZj!8~Av0lUt<%j5;i{dv z+d22hBC`S=&o8s{duwy#c%P!iQeB(+sV=#`i28L6l(W2g_OLTptF=%AbiHhr^otyU zCMH-!f&ek$_;4iqWEwe!KD^u%;h4hkj@$+)GrlKXaumpi$uE6M9lG~2x1>Zpo{v)r zk1Jpsm+-Lb`r>5LJ|IDN2_oWNpS-$wY=fOQA8dODhiK?&)$1)E|Gs#h8KGCEJQlrE zElK<>4`WkvqMA5aK281OGThTuw&nlj1qzs2EoD=PThFu(&0E;zD&-Py<h-%3xwu5O zFs1v~HX-*2MfUS0x`Gz)Szh(oGcQrFHHy5Ry*fG5HX`JIpUFc0AFcKO^}snYqN$bD zD~OWnWO+r!s4R-16L|ZdW?gQge&CL-z+*SqU|&E3|If7>B?gbE+{7%+&4b@@0XD}D z@$k{o(@6>k{?t>@(Q>wX;Qasa2Cx0DeY;*2{PPA^n;`H0d*AJUExmoGK0>jh`{zLZ zKg(kO?PdPo>tz4qfT-z`Pn)l5tkM-qIK1rp0eT#3Yp~<^7`dR#44+Ka#Z5(0TJ-6- zIi&r1WIZG9BUq1jb!+lGlvoJQ27aL8*lHD@tJ~cpV|8nz_mqZ<M2Pf-C!r9qwQ>tu zH&*;ifY*hds&!UebQ?h6MT~yW$#W@2QlgBQl$jhk9nLqV(e)w!eZi^oo8t!qQz=E7 zk{ZOo)6Bl|ve9T_=ogIC9}a`_JYC$T)Z+Mr&K!=aU5!7<J{@B_E{ni+p|G=q-#u_J z(h_-JDY)w9j&a^~rulRX&ecKTM{FrpEj?N+Z8o*Cri>8pnB){CB3s`t-J`72k_;U@ zc79X7q^~-0rKj7$xH_Liq#^YdJQmq&|1_rBj%rqDb8_)|9b-KMxOQs~0|B8aG0%Hw z^L3TPG-1K;-{_Y8lCj12ycU7>gx9rcrQxJbnF@LoaJ9I*;bre=6^#&XY;r*^<9)d9 z3m(W${$FKWWmHt*7QIL#($a|1(hS{Q(hZWLfaD-44GJP5%}95HAl)F{AssSEH%L3= 
zyW?B$*PAszW=-CE?wRxLv%h_IyfIgEnhTW<N@PIP3H1A`cA5`Q12Q$VoQi`G39V#z z*%bYB4_?2PB|2<rXL&}C0latHX%`*EFHCy-ik!RV@u!%W?*6GSm<E&jy;z?gs!e2+ zt0TF?XJSUC=E*23Hl`ROk2n$;r*SCbZ8_e|F$5QuEki*`m~2jV-42}MlE3+@F*FGu z_<JF+eR8)L-M-Xa2~uT_7F9YY<xkv-S&HLgcG6Kds{3Y}c0c)Ka)MjnJziF#q%_=B zDFr#-xD_ZBoZup&BH~A_rL5mbV@Wcgf~o1%N9<{}dgtb1w^a62^TP8sT)goq3_+hf zYgYWs&=WSlt{6J=l2Q(o4rmvE<6v+NKLnbW>Xup|u7*ZG-5J3nn>|!u*Pc>R;?l7M z3Ny%5rNcA9rYHD}a`cgx*cBvfYH+7;ISL;DZSSwXt|wM~uWV_!?)C0^U`<VI754}F zlcxh)Rml3Zhe@~?QPLDX5zqKhA_P0)A(MPQ{}dmmsie7_HVePNS}635r%ioBWYYAB z(RLVS0QT1JH?uqwQq@<hJCa(`FST?yg;`s_<$uBMqrT%v2}`!U(AF{92=mK}`<&Ov z#-X{00gPvq-;Lh7BIy{iYl5R5^4NG7*VAD|&qVCtU<QS}a?h%NG%qcz%&|!Nzt>0J zF&sh~0Y1I*b7JnLnQpNXv%au6Co$*|q6gbP91iK_EL@7qoj6ymdCTS?74E8XIiF$J zV686eNSqxig@=Df^oq}h(k?FFic!}+QO1wHZ?xshjp*_5<LIi+@PU%}&Nc-aL%*uR z=FpS3@66_B5Z5XvL%MDxi^hwdoWjBS`J%_BJ}}xZxRH9}F<lVIezkb4!p{dP3`AT) z@T&_>B~$Z!jd!sq??B~BWx4I_gsj(2$cbH**7BNf>w_zMdO~Gp0n<O;e_!R~L>ATb zr^iPhWq6DI-Dz?23pme;$SMlV!fHQ-sT|IVMYw<tA<L2>X2Q;bOhVaZMMPbJi2dV( zx!%)#YKN2Ec>0^y7jx7g!;N`+^+dwQUzQ3QpF4S&{w}~5A?d|$q@b`5Ph`Hjd`dYw zvn+LtO>s%DrKFm5zxCie3}0milGQGv>Ew-)`4qpMn-*jp@&5(>?c@r2H25-4p5#0> z;q%?I*NNC@bO12)_w);>9Rx6R%uM8UwAw=qVzz>g&fV4P413Kvp+?e;>vB9N<qeT< zZT%5*1LR%@j=Q)tg%RNVB9lJG1+QoDm3;DuAOo$R$68{`Lf)EZ8aUZVn#v8~Y07YK zzuXD0FnHQV6L3A^Wld@gNHMpG3PaS)+=;-s@u#K>o`pFLC##dhVmBu3Y?rshwCYbr zd86@OlXJs{tPfv$<OQUC4xFp`Th?a|#rgaP1xVoAxTp}ZZFKlF9d3lZ^1S7PSluE_ zyB7*|yllC2y5Bg!*wg`lK@ptmgR}v~WM|b`F%4b|ia+2yyn(m?jfuDof*tq~iT)|| z+eKTziPiL|)khN+U!Se?OERy&jY#*FV+2RV(7o_Ir^|kHcl5pAQh=8zhU^-M;G}#F zPf|<;klPchU2!{=RuHb-n9xj2Kg%5~G-;wvotb?jb+F-PlHfCn@XsLjayOz2Bl&&X zWf}Gyc-k$P(_GzLof|*Xgz+gNG(IA<TH|Su)PA>TrR$l0Lv>|!U9{EmR=+(|$7VN& zirQrVMaqfaFN=w_>VlC8`XkDh>KvnN_O^B-O8qnA2J((nfW_>NTHRv6%M?tqj89hz z3!>%J-E}Acgs8f<ur&8pn;avuE!pE<Xy-Wjq$Iu{iu0I8-V7yUYHnF%_YLC)p*L1j zuVg8Lbphv>Z!2X2-}?Ar%*`;JuOc&|#hJa~dBH+4D#BGeCnxvw0yyfQQE6?hJ$gi` zE9*-)y5CS|9sOt_;f3+S%LSE_!Ni(%WMS<NWwh6r%>p?HVzQJxanmozWx-$JnIP@g 
zEG$W6LQb(xYUS&wzUcPu`e58gOTFj@pI=$s3#FCI52i@o?)7>A1X@}Au;9kpcF&d~ zc%$0BrsxEp6Y{3XH=J`eO<zh-Jf=xc$j~iwU<U#@eaYHr#ySO@%~_~0Q?oawz)-k* zp5v0E<X7hsk30%VcMZfV8I9tcF~4mvoHuIv(|$YA;pt{c&aFD!I)?MmzCy`GKm8yb z2s9s!NUuGeCBDZFA;Y!erGLm4CC6}nhU@$x;}UKv?(Dh8bDh7pb+69MuZ*jqHLaIb zagPsVeamkZs@FA2y(@E(d5<IR(b5x|KmOXZ@o8yNe5$ObZXn23cPxkxaY=}mXGwT1 zu($4Ujs0FczL9AG2QzTFamm%4qUtt1T$GGeM9HAbFW@W?ms5QmEk!sdR|!chL5>(J z%b>ZLi84h{Y*=h)W~zBoML`RU`pE_V!YN$T&F9=H^?-S%zTQKu8DSUWBF_-a`oh~d z&7Rfnx{r`xgNtjIqqVfUw7kUZ_a$u-5hRC!uDC4g!IC4?k=A)XHY*9VM2Mu@s`}tk z6V?ACaA)W6K-#ek$^<n?JTGT3HZ$pA?w?$q-#Om6hl}00opeY?<Xg8BD_gB@SK}p+ z$Cl*eDtYQRw$`??6u#NNn(M<8am0x#tZ#Dhv2PwL4GhB}cBtpzH@AFgFgdQfOjjrp zHq-Q}PY!)2Ck-C-6WyqX_%TzY={tHsD7XM>R)j;LA77jdMt;L(iB1_!Jv@bE<uu@c zQh?>v^WR-=5zj%oTLGp+OsCCDRE6A_aU64M+s!d735`#EJ*4GxWFvu!S0bC-I4<-{ z5@RHrp`CsFTJc^;0XAvJYS;e$3KbwAI@#}Kqkb5*KQot!?V|nS<%y>TxLa!X*6D5I zyEBwnrF~kpm-?1*tHCyc=I)*zr~tSIz5YO~x}}Z@V1?u4b?1Qr2LOrIzY7y{qT3Kr zMy?xUKG0^S;P^J4i<gY@azFsWeerG4CY0X3Cr_?q$o}W@5hZuAf{bC6z%AcWOjx1D zd>Ena;KG@KffMuq1ef|hvlUBKZQPMVwU?z&{3}pV0P!Vp{87otaf4^g#*+^VP9LF4 z2IuH}=niJs%qL6X5};^aRk-s^;WnD^-&7|RF!52jJ_N7I9oUbisy(zHw5QoTd`z$c zEwY~w&|LQLr2(XTnwk#k@<aE>{yaPu5UeYC%T_kmlmuA;Bda4}Yc9F(y>;6o5p~US z#p?!}8xwEXObEmZ@Uah#gZU~_wP{cfgjq(B)xpJgE>7-zWblI$zib2#wcVMfCr~%l z?G1J`m#Wo6){-_CN345!c(`#buJM=V{5-b!jp;G*I4^pyHuahv#w&3i8jOYEg_}Dz zJ?iUZXJpG3)=Ny2lhfWlP9y`*+1|+^Swz-zV}{Z1<jji#02=)NKC${;{gjN-n}Wpp z>-Vbai{jWx^y60pLxNo3T-PgN+LSz51E_IAUJ)Oonitb1%3a|-gvRmjhcY^_^dS}f z!|cP&>d-d-zZn$<7Ot{G!2@xa6-NcNMXz{WgS0>(4MV7R63OW0^BYH35XX}u!}}I~ z_}+YdbHfhm$i-2VRK_lQm@cwXMcdbj3s}6d{N?I2w7WDk6v5iV{8_t>aM8g`$aACY zX4q0f`pMZIsx-l6;9|=IvA4mu26ckd^=HpxT?{Wv(uYXP>evULC+fwa0B-Jh-*rpF z5<6-rnh?qI><;Y2Oe%4qN=HJCu_%l6yzx>`1*Qbj{og0aG4MbveZ0NlO#z#m8)(43 zczfbRy`Jie_43R7&iy!b-z_xZ=Pq1XMG1d$D+Es`9vaRY<}s@XOxMRa^0q%c628oB zv_*eF=Zji*ek8FcCCrBwQkOgO9=##@mrGuDV5~F)5x&XNb2{Y!{dv3mMi!bum|k2m zDANaVJryoSnyaYD@7dG#VK7}5CZ|uT!x89S^7+BG95za2AiztDLhTBi_p?3?=ZGjB 
zvS;pB#xvXa>jOqnQ}MYqHU2zP_L<l7(}%}+bBKY_f%1YvIe3o+cfwF62&r!kWRE6M z2YR}cM}sB?LWn@rAg|Ua5!CKFB-U_IbN&f2L_<o5)lKFJ2Pr*O{bVfQsi?R%9X;2y zWK1k!DOg~CXp<+CU$Z?s#lOHm{o{)Ne)V3+kRA4<xdH&TLn)Vj-UqxO19K<m-l^$h zVtQg*#vc@MMP%!Ex;}1Xd{5rd|9Q#Jx_ZdAeimPn085TgIeX1B-EhYlPaG+q$wKNf zyXLPZISQt9r+6SdQI_s@{4{tGpOCr}fDhVwW=oG$(O})m*to_cm6B)>lK*v<!JwS( zugd#{BF?dwI$C5m!fB9TIffoIX5Wh&7?IkSkUO~K`OclYLiunvb*2Hz${o)e#ryJ6 z&U!X3f(e!~T}&jSnk~zCs;{TN_U`iF&4$*g|6#Xh`m!tiwhp;)u+-Bh%f21wzv9dC ze!2QMnFu?3)(lb-RD{w#F?$aVdE3LCs9OUpMv8;KuY)HjH0&zF%+Xc(8|=p<Zni`V zERhF1fytUI2>xW5ep2*IXBC)|QA5!i1!S5S{ytQ?2|6y<2k0OSbZQZyy5LB0L8;Tq zYpq?2I?a_JeyxDnbKQ<FvE_fZeb6e{GFr0D0vAWPh6tyQJQO#D*9z2s2_Z*D5*Leq z*w{+hT~%pyI7HImV<~5apZD6vdDo@Mn8bkk04nf?;Z4BJ7{2%mx;QZtc-&Unu<D?6 zp%J}~o_@8>JR!e0quUZaMADZIcepTX@-#vT$Cj!n6d`|3Cd)tsHdrQ-tKPStOYg4< zK<i6rUwgxKLEF_9_24V>T)$R-*j(6=_q)qS8)$JrV7xKsh7IX~M0GNR`#D6<Ob;fu zI^i-O*)X05D*!i@DuC#!s0Sjaq)0xb2o*f?&l<rkCG1TukJ@r4Z8wfn!bSAN8G*pb zLE<7tALkVcJ6y1LylcLtOn*cH{J9(W8rNbDbKO{7al7@MUg=+uaNSK<|Ei)+#C-Ik z|1%7?2Cr;yo|!iD8teuK`&GmnHxb*buKFG`Qb)A0LGRw8f!E_OGa#H(B7-FMs%J7Y zKJ_CcR4r8BJiuMSB}d2#M3Z@G%@ILCDAELOo0>piRCe#BMs`=j8zeo25G~?4<lq;0 zozppHR0giEU|T8p#IUDpOx!iQ5-ji<g3@}5u@Ry4!jO@17dx5v;Yl@0FH}VUkPBi` z$a~VfCYMY_g}E14?d)FTu36sQ+uJe2GcNKyjIdaL$Y!K685@X6VbFT12|DXomLw8n zh<WCF$t&yGyNa?=saR{5r(|fmId_uiIG*?>vQ!PCs&3_GNg(!8vpV;Q+}b$rFF^s! 
zXjEqWt3lR*gvYvkf;F1uaX83?ulz{4`3(dxt_x!nXJARe3CD3T{T>irTzBDi?o}r# zy6*O+7DV8hr`E0SIu3~O!D_r$Lis&XHeH?oG}SzeQ<5Z0K-}?=EP-lLUQ+TMO8X}B zbEQbtY5s{R-k%c?O;lhzbthxNAN=bw1s7)N7!}ABmhQ6l-YWg_@}AUuAy@~&I=fq` zIcccN_BPc+jRpjPz}THa25BvSG-<oi-Ryr-xm~G8pq0Y>7pog1f+%#)y%FK|+2u8v z_?u<S>hG%;=JUliWx~^{TOj9fF>~?j>;v)T1;PuRNwM%NnZ7E|Ps<;8#psC<U#n}6 zM82B<z?jdGrl22aMEN@CX8~&IWB&Edc6rHW#4h*+!UDYYqEO8ltsj;Y5aMd$&N%+U z;%8Vyn`HTtvW2;=I<n?gEcv_;OASd8k~@xqnyg<AZL#Zel|?^<{IfIJ7P1atOQ_|0 zkB}ke38dWoVh~t(5a9pOUGGs(W4=Ht*@A=F+`=yp+hs?Mul3%&DJd8wBWmA3I)hmi zw!XSDl~fW(ov0t9*8(5#_?+K=(}d@s<uSm}gAD|Rw#^O%ffN-FoxERoRqTc?QSAEK z<KHfWj+M72Jhp^1^j{iim5VErFQV881I3!omhm&OXk^cym0RhV=_V&9$eNY#{SXZf z&5RK1A#1u5d6|!tB4Rzx`eYwA#c*8JiR0D`_K)4Lh~zL(WhsEC4+<@7TJicmOs17w zZ5}g!DP{$Lk>Zk{9oXOrfrkzRf(zsieAKc*wSh>)CrMv=kO=^Q=R6ePeN#pCs=LFI z!YT9p--&aOf5J7yz*7e;gV=W+ilGH-dyo4&!{gH!IQf-7q<%eJi>T1C*?s_kttATl z+4M&3#{@Wh;oa?bdwmy9AS=6W;1u30EKuE_?qL=380=MKP&qCz{^fEYWv!q%F~@AR z1_4@!1-G3}!qzvw<@!M*CEcuCnz5cPubV@SF+xAygYB&M?tYH}ph+YJ7DTzgOLi_k zI=OKWiLUT5!O}v4kzc^IZ&j8q4vAOxF6PyBlg63xFh0)v^{%YKjh{)x3XyIWT*1Ee z&1PTd^=al3FX!9jg!Jmp8X#7Pv+c`3&pi<DdThh9?VB-@osHDp<Ia20F=x|2W1O;5 zJMNmaCSR2!)^*!=Mk$lJ6v@BQX24P+!&r3OIn6)gB7BfRnHuKp=pfPJ(b%=R+5%B` z!W(p#9@(U_Q`G}eCn~_r!z+H#+|haAO8(ckdH&XManMmE42{U@b60Gw%FI|Pdoe79 z;kemKr51N8BlH*BFeekCG6ZLWy-YH~V$H(+vc2b|eY(D$y0+faBnZ!%fMmlHzhSA5 z?M>V&8VIbY)ArY6YJFg1)wM+|<q0Va(i9ZWo`ILPF5OV_jO|~<+k?T>rs%72yA{j) zdbfUF+lXjjG1yahWoeyL_T>2CX>2{n&5bk3y|}_vlqDotQ8XZ;HHmIXak&4pocOuN z@_Kd=2`Vu08brMIqvf0=I8EtCI`sZcV31@_<e8irjFhvM4vewqsK2|%!@@=!ji#m- ze&TBa{d$n{>qPI5-wpSL>vOSA?wwYBiDX>b>N7QBLT6;aj9$Mdn{xFacP@#2hff+2 zQit5wEy6b$B|(06jo^P?=RuqQc6|T;i5Ro7fteY7K*~=I)r$_lzg+`-@c%aCsDU@p zptq!!0nvXc{XnoBtEoLYVIx1$+yhM)LEqIBRuDdb&J1&)ofUXH!|uM3OjG{%_O&Pf elh*d%qiC^Zluu#jl7fRDfU>;0T)B+JhyMW8?6Ud* literal 0 HcmV?d00001 diff --git a/docs/assets/design/hybrid_kv_cache_manager/full_attn.png b/docs/assets/design/hybrid_kv_cache_manager/full_attn.png new file mode 100644 index 
0000000000000000000000000000000000000000..30eade5c7051cc050ffd0ccae797fd083d4389b0 GIT binary patch literal 4120 zcmbVPXH=70v-Y6JL+@PyL6N33r3S(gK|o3XDM}GS4ZRbD1QY~>NN>^!9U@>LbSYA# zcZhTk9Vr1sYDoBAzkC1OwZ0$s-u>g9z1Q0B-ZQgjo|z{~Ur&RMnw$E{l`C{wnrenu zu247tb0ta&;QP31$^Oa}hVNQxkBkDcb{B$_CeO(22Ml3qwxjR7qK1&7hUEpqDbWl4 zP9s!q2D?=1g@toe6$4w#<1s0E{7;3L%KR|54V0VW3%!fkY6RC532EII<Wqv4Nww7m zqK%KFxVO>Chir?ptnzp^G>1_6eir*dUsw&e`^xF<<pSjIN&6*(r1T%a#L(LAw^oJi zz1IMt`6hu15b+eZSpaeC&nI^Q@i+B*9YDNh<xB^}6_Nk>qb6-*ePyM%k`iL7+&DTq zT0X8Ai<RQ%|1vhl)Rb!v$BkdQ#69pu(^!@_5eNiDMa8`n>PSNqlZeCK-={+2Gc)Jf zm*jt8bUG?OiHUO0PRG*n|2j7AezeCnO0sA!Na0=RQ2>F=ArJ<~w4RmwmQAcpqv3>* z?|#0%HyeB!63W`wh09>$2&1-%g_%DVMjAk7KgN(*Q<IZ89M02!Zf*{^ebaE<UON3M zykkw}KUnb}&9`rmOGoIiRjt~;I*?aTPg1}q8jdIPP3-@aw$a_Fu&(}UE0(I#N{X-& zzu%dqgam6FaG9UX6qxV5H2uVy665efOY<pooB3r6LKGGN@>>^h?G}Bh<rsv4J$r#n z%unQwaPMq3g`f_JL}Kx<#lzrG#R!48o}k0^p}g$k;^LLBF`fEtK_`SHA;{|5T17?0 z+Ulw#E7jTAnZ(P_`X(lISZqT4DJ16=`9_IewppXMVCE3}Zdqw*gV)OSq&{3-or%7F zJ6@eq@Xnok&p-Wbnf<lRmCs-M=~G^%fLr|hQX%9Et)#oqZFUY04mLLI^N$j37m2NH zOOL_Bl>Fe_kV`Vj7-M3<#(jVp97G9)Dy4!$e7C(VdKW#0%&r|7@lbQsOS3@uj$z$w zyL)>y-);F?dk9WrLp$0ku5d}wmNXPvjFmc$2;w=kB<_q=_z3wE70q$A522gUxO!@O zA_94YxHh~A&8-TnpMBR#Ac?U({^d_lLE`m7;&X#RpgzYKtskUq&U`c)O#|NFni8XR zf<Pc*Vwml3RexQ-&dtqjVq|3f?)c;cc<H(6X*&l8!d$(_*Y{j|i!Gg_Y*8qbkC#^& z2E($erLCQ*=;!S3?{8;U_~y-*8pn}+UnbSEvNGVTlA}0cXJvi;$<9no$i+cVVc{KR z<rcv8%RkP~&oeSJH&$0~D*E=ex4-ULS^B{o7!<U-v-7#7MZ`76)ehfAI#b2%9mk^x zWUoh;!pU4hx!gB&md0ocN}3OIg-8&-*-#;khBSENM`4M5BXkd(>bv^JP|XV;#YvXO z^{2<R%N;J?vj^cPC}q#%M-aFpzMF-!NdOBqpEJYKJ3Z>0TQ8pNjEZCSsAaN}J?7=i zccc$;-v=)vZ=LP+RIei&kLRNnd^MO<rKF_ti5|7}^^hE;na<A6(b0H<cw7%C_#z-M z@Vpzv0R+J0)YQq|cL<!7-<N!G1|&*oXlQbB^766`VD#=>z3ryJTSX8EaK_5cjuNZ{ zgK>HO{Q1+TPodDIj&K@fWf3x5*{;5%L`YZ|cC<A`B9SK7Bldtmij0hObab?8MJNaf z2?+>v3=C-BVjNvSk}qC)dU`&2^2FQwXJ;e>9UUDKi4=Js{85aVk@4@dAuA18>Aj5- zk!s_{iH+NDIhhk_Qmfi>;pwwi&p&KhZ!FJ!n<uU8uLcXUil5CIzu2X*8Gs!xN5HEB zn-n*PMKMku^K4qaf`UDzPF$1UmIJ6%g$MNlEzewYj^w7JDf!7LL#`JJY;@TwWJsBh 
z#9lSC#zaRT9ri5E<HT+DM%dvMMpY1;u(Q2QLr2#=T3%71PN|zN{U?8ywPdkwrirCx z3_6H(e)J~|```zuu+65128$+NAh2j2&-V5G=uZ*T($WfxEh#JO>+6e}_Vw`@7#P4* zmHeEainoNrPZ}B;w9~{5I0Yu$+}-`Xy{mD!JF-AhH#aqjii!OoNT(0>_V%iyf-UEz zWM#=`8@a!|qul@iK-ser1OicdKh{-!Nq4Hvxb~NN=1sa5_D4um-b=mUeJ;EH3eR8Z zrOHKE_MSFRrUw{f2a`QWXxid{vB9_0#8h-Mh8&JQ=ZK(0=MaYTJ6!D5<?2I1-VlV* zg!`9>)f`KxDRs@riU{+mN=ko?LJ33A#jjtyNit-NN1ZNl!mz2p_%kyPVQ@n*{OsE@ z2sBH^?MpbI$WP(Ed5MXMo12?iSvN&QZn{j>I1DX&H_p|$9iE(Q90YKPTgZE_QbxE# zAS3PF+Fo7<IXS%G+^VW7mBCB(sFt7;ZZ58tU{c^e!#{s|15xqf#S5EfR#t7m`22Z( zem)CNNWE{KpP#R&2)jrR>J|4|x5A~Tr;FUaU4X%8!%ZzMkpN;+q8AqzKY#w*)uk2^ z5<*?Tw=MA2%-o!Zho>MvU)YL*a|SaQd%TX7-t%aXGf9nN+;a6mAAg7_EONOgKHScD z{vp2uBR3AITD<wTVtNdJv1?P*yNCDDUH{?f2o<I#5(~nzDt7B&TinoIx~>p|S)<E{ z0(6bTz#qeEEuR)JybFDWpjm-ZOaNK(u3b`jajQTk3(pQha7l7u><z`$t3yLWfHU3+ z2?+|aadT%RBtT&>n2U=G6bkk5h(o(LIHcz0N|?#T#Kmc}Ewm@O-@SX6_Uk_zbZ`D{ zx527qcFE}9zi$MNqnnYDl}$}fF8rig4bnF-7;8lmFD@=hTbLbBE1R0m69u)6jE&vg z+(h2fPb@U~VZy_^f#56~LMO!PTz{-8%o;5lj>}HCoSYwS^ql|hRBOIZi&~q`xBc3m z(=u~2uu@Imilr{HTC6o9!3%65ykkYCEA}gXvtx1m7ineK6w?A!t6#gZwhso=iY8QA z&I5zCoy(6kE?v53dA94qLsr<cQaw34qu>;|BM@!vf{U&($)AUZfJ&Vm9f5&?FFywu zn46;<?cC=Z5(d^XGBOn9<v(O(Fk@i83~^9R%?t7?*Y+wE4>^xTTy?csv;XdJHpt5# zQ(o@u>@0sXHaz^#2$r7!Zohi<N+XW9XeBew)h;&a*$&Kw-L0;kicwYPe560Na&mgl zmfn?|IpTE#^pS&4ij#SHnU;-?hp79<2p`*ITTuX)b$nfFF|bdts1U;Ai=vt_3cL*T zGk&GhTfAOuX4ZVozE&$=#`bM|>sZ#ToKa;hty5N3XmO>KHk1OZ(CrI_m}>A0#nos6 zuy%T|k|th6x~S1aAU=nOhhs39!otGXSUokht|^SEh}FyU(P^wL4K;Nvud*`?#>vUK zxv>$+XlQI))J&9em2EEjL7H;ws!~CObU5AxgM<4L1#@z83=9lxY;5Ew#>dA+MMZ&? 
zgjMT3Vc|T`#?sOkhnjl#g(hfc1W?UnWo6x`KcR8B<U|i}b*+xBWkKYhT&n2!fG#WJ zWRWshfA4aT3r8?j4}w~HWS3fdBV(|Wy(JL5yo<?|B}Ctm<)Wo}8?Udq=MpF-g;g-| z$K8!!!hVAe+Yo1Hf|V1-pPN?n7L!E$@`op*9U0U!-R|oGs!a!98|?2lt8?WVh70@q z`9-nsk9y=ft8I^Sqd)#0I=+oVSlb?4ZEa6n&k26((fgPftG3|#jFdoQ!=zeNRAgvq zXdw>A#r_orfS}u4T{ajQI9j4IE-E4-0tSP{#8A_$92^>yM@L6v3zn^jS8%wT?stS; zm2^p_#P$wsXh1%U(b>(tSL^JH_l#m7{Z5jngoKQcVBkWXr(mqZf;spge98CMwJ8be z>S5&&qt}dEFb^#S7NtJx2xbiwzVzO+MSBQta{!i_l9BpmAO|*vk~E8oLX~7uIlDNi zQMTNZ=x^ipXhd9H7y~>rq+8oqCd8zwypa8aDzf;4c$eq>8>*VZtl)tC_Q65g$J%z? z(QJ*4jc~&JBT7Fw9B5Xywzq8?u9p-Se+4Q;!+h?@1|QH-3=PE)HYL}GvoU35T#Sr> zVq06(R8-jd`W_#?C<6lvi=DN#we4*HDGZKooJT}crVRjKqU^IytGbzH6+F`tBp2D4 zE*E$FjdyHp4BO`A?QLje1h`m6I)DBAeFEvNn9zv-3=fZw^)nN1BU{VbH`p^EjfE9r zArnWqXR7G^$h?@V@F$jK`NS7~scX4bsX;gAp`bSfg0KKgKDt>(G6BC_I`<E9wRdsq zrN4!G-Bz<)3LZ1hF++|a2%Fn4r;@y+_%o{$AwXNlDpVT8r&$bo3IH6;$!QFa|C)4L zUS3}5)z7Z+aYIwn`ntN*AqT(@Hu4(v0S7jYjvMI@9GzWU4$m(9w(&q#kMZza5Bs92 zqhrzNZ4W?(wl*pv;)&Wde6znbI}6Li+}!rYhDH{%5Gy-7fDLK*N-WmP$SAqFnfN8_ zuN&Ol&+nN$c@puSc6U!cZ|WU{dZ_xMW^AWB<udq=(1^)|bl%E`2y|W{x7UG4#54bE z^~jPSVlWpjNu=Z*|L!M|(XZ|UH^2}RJ!t+hD{c29*KGeI<bF^IgElnul0mK<({G-! 
z^$;f%*ZK3-?YG4vq^*E0l<4A$l&!rN7^q}FQ&Lh+za55#g~4~G4^9tk*-<iXvkzTe z%n}k3fG%V8@XSJ-<FXu`kzr<UUz9ORBuZqYryrbM<mU@GIXSVguw?K?GBi{2-%SE4 zV&9VIrZD{s9VO*-OVGZu;->s-hLE<(%1U~AdO<<K>gwu{02zsJ%LEx0{5|u!_Z|3< z>R16~6TBz4y@xe769D^K>MZ7ZkDMke_5Y^CvZi<~j{C=ZdHXK-`EWX$T|W(>PfAY2 zXG`DkF;Z1k)z+r*NzTj5yA@~((gW~0Bjc2OK>u*HK-ppNgM!ccKYP(A)C5UGr4$6E zh^<XwXubv*7(zx|SW+bQ-NEmaQRM!cS^i(R<bSg;E`rH*-6I*>uQn}>2G5XmzA7sO zSdqqw7ho=Lzyn{}at!9b7#&bhNDD8Q^nt+9SR>y8h+TV0XHSpgKc1|_=^hOAHHzgV zYk=myzSIR?q${$E39R^U+AGOuLB0xX|9@!rzoE#Ne|e7UKlyl4Cia_aYN_j~RjOKt F{tJ^y_ig|H literal 0 HcmV?d00001 diff --git a/docs/assets/design/hybrid_kv_cache_manager/memory_layout.png b/docs/assets/design/hybrid_kv_cache_manager/memory_layout.png new file mode 100644 index 0000000000000000000000000000000000000000..bcffc27a716497b2b280be176b4f40e520dadda0 GIT binary patch literal 63113 zcmeFZWmr^g6gCPLC5Rv(A|eJT9Rkt{h)9aWPy<SL4xK6~4bmmuAPqx|2uSA;5<>|L zIh1sL&!E0vob%(H>pDNr{Ll^T{lt3Kx>wzM0$$6%BqqE?h=YSe{7OpV4GzwgARL^F zdH5H=Zypm)C*t7jzJ4X~T-hmc^~CeDmP*pu8HVt^mVFTJbzkc4W`!R!Bc%I3U+KXW zUO6VL=fU-tpa<IfZ6Ay&amo9phtxBe4&26Jj;p=ic^kA)vdj0pX@XhiY}xJXXY$}g z0>t)kJ%Psz-q&!U3wjJ+NJN9X7V*WFPfKR1%X&ivc76?tWFH==u|OP$a+Qu&?eb=w zcU;$#@`g;Y3j`l8emhydKbJVMYxqllFPr}VE1ds}44NS&YO<Mca+UK~UCS&qF&qiq zl?HxEiY%06qvlLer558b&9NqaA_C!i&I0k(pU7M;^ETILX--91jN3w5bW3*X11dc} z!`GEYIbWUazcejcx^jL7J(X)qOG|DdbmO!Rl3)(4gcVzoJT11E&UuYOL->o}$CQEd zw(9tbY%vz&l`PC%`R3zy=;&hDSEUtizZFP|7HuP9i0S01&*ufhZ~YxUSXux(I;ZZq z15=cAHqxRl-Ey5o@Z00N(ytSm5@KUtd&<Fhx_|$Mp)3-aziN^Ts4ETcUL0h*x>ibn zh;(rO3_k)xp?+_MUZAH1BY8RHDua<8Mz)vR`1y#XLS0vaQd3iR9mL$O5)<bld<!H3 zLmL_zBy}R1nK|kj8bq9HDjloFd{UvjGZmZZgDoi?A+HWwMNJ=@65)aYo5Ogs3U&mk z?ET_ovy_=Cs+&l@T8vX~GJ5AIGofr061oiyhE$G2rFbh|Kl;71!!Kvgprynv!{0s8 z*B3UGBa6;?VO#Xg*Eq&(B}LuHuh0=fBPxwL%w3JW<bEs|%73oT&5GnZ(x`0B^|z4` z3jE04cBhf{8-zY$A}Nkcd4q#lT5S0?$)?7}Ihrr)B%4bM9EK>ci!H=1wxe|l-?tWP zEB@(~Qv{KRsfuGfk@DH|H#D|v_^lQ4^T}oy@-E`=iJi+DM@TI?Q|{Tq94usW(`IKx zmI2vuF9XMr`|qFYfb=Xz@v<$8ii>;Wpv;l&`do3+ncrWF;K=F8{~5wfJ=#D$x*)X7 
z!qvq^V*$1V6Nm)sy)2E!DZd;A153fd@#Yd17UtxfITm>Gq*==kXUyl%S>w2kR<bfC z+YK8QDl;v?G){}l)Kgz?XpT_eeE$+b0-k*K{YGHRU~TEBd5KyOCK&wBn=mvfuPJc% z>Uib+<RH|bY?Q$vvw~y3xFyNY*>T7JQ%l<F?eR*bT;*>>23BqI8xwb@zkKkqDm1r7 zv2_@hEy3VGf&@vC%|oU<MxAaVDe~DTA&-K*enr$0i#Y7zJx})cb$NUbBlB>sWvT9K z>RN(p@uI`L*2Xdc@pj96-D}sW5O=&QQhlw6rBl8y=O$4y*I);f&7+FfM~vRdYT1lC zP8JoMWFS0Wr?)(SyVWS2Nw)PY5lS(cSwwwn;^vN=@RZ^T`RG^}v040fmOR?{<d<Vv zF6;Mc$JMdx2L#T<PuCX@m=GZ<Iyn}P>0+$V4Gc}I-Rs{f7AOcWTUcQjF4v`JRB{}@ z7{64=oY*RSRA;GI9{LIujpu1t!8^EC;_onzAw)1g7wqq;-FL?uK9M&|2TGQJ@l^VQ zmCc_WI;q0?d}>gHbOX6@8sFq4s;9rL(~2*^-c9z5s$^UbYT|#iH=c`^DLfy?c6ln( zuQ&OE!^4CO;rvNBUR})8l7se2jXJ7;<4XlDRgg~p1axhLKry=t6V`+3wb#V#MTAj( z6cpY&p2YvDd83w4USr6vutjaG)=6Z3@tFXf_3Mg#u%boJU;nKrba}!b9@zhVBFp#c z(qL&I{D`0-vVf-A8=*t&#LnowEGve7(q5cjfzwGJ_+4!B4ql3M1$Btm4{sLD?bT<a zZ!NB2VxIa@TAK_JFAWAxSTWK&W)du>@!EPmY0k||dRMAA8cDCDyyh<MVR&(uvZY9j z@>`yQ^PPss@;nH%tpE}HfxUm<4J)Uz|5in_p#F0<WchfPk|9H>ioiJ5Az!`f#Dm5- z_}XLu8f9pWPjMOj^P*VfbUF$DB<oW<a{--g^Sm3Sq^~SI#T$(`-s2e&HnLt*vKwrF z-q*wF;?UrN`JhR$b+gI*7oW?MWvI`T!ZW?*T0L{&J<3yQW9SEw)51N(b(Y&$*tv@f ztUp(IAiAfVC107zgwMRNYl59>^RpGzS9P`+CC{BiuL)-kCrF+oI-}RHY~GeXvi-Ak zrmyx-N+c5Lq2MDrhlAUhF;=<8^6TzYkKHdf5q9sJw{8(SQSHpOqql>TKX}O-`vcTZ z{rRbs=><o{e2@LgD@;vsad8DNZ_{?i?K^iIJfhJ#6g_C+$o5MxXl#?!ucE|I&;65X zMD+xH1A20>d(8g%FvWJLr{>`l{vQA#_%g4{#ESoVed^X57RscN;reS3D+$z0h{d@4 zA?a6Uo_AMMdZY}=1+L#TxxA+x8<f6~mkxXC9R0anU#OTE%_cnEaee2Wkb;FmZM@WC zb8d|LHy-xhaGen9#|UO_H<=IUGV<&Np2qA_OhTm%WEQo-`LxJ%F({`!D=WaP+Me@l zzpDOB&^PQHOJ+^@^n!=QUy+lO_w?GYPnB!R6&6XH@Hjy--@FR3l~xsF%tV^>7KQxA zK=N;G6R{_j#Svzx*!nz{C}ruSmZC|-@$J_VW6Y(FHsa5A+Vc<k+;8vY!ddRbHdf7y z77y=5e`FWkQIR{dZ~Xkz<Cq+EW+(S2TG5E=oF6PVaB+bL(6snk4wRoL$javM?V3)G zXjfW<hKAC}E|ym_CdnId#m&sTD-^$A`c5NNRq)fdx1FN}sf~{GO$AzPBQ8RW-j65) zx8N4{Qpv@D7vuNKg}x`LIdh^Ut|$s_l%{c<m#B}E{UH^n2_xIwRDhk44`6l3%ZmPu zmn_4t2~JbT{p%D4vI+`SB7TrSecHIOAln)iaq+!+7uVd6{x7DR-<iHw&>Sd#)MneJ zy8FctId5B|#$t-ceAHNf*Tm7d=q>Lul;B}H-K|@n*5*eNCETzZz3O-wOw&iaEUz=< 
zkW}2?hY21}VDSba!Nj}?(8whD>^Hgo8kyZM9v){4*&#SS?UqYP()1KJGK*e%t>9hj zp|q=372RK96YlLltm$8rHv9$PRpiez?{gm#b~Q@+Mfbgv1@C1qEjBbHB}zU!U5!yO zu;fF2lR)JU10+{mkPALoYmK}Tkxw~X$f(`laNM7vMc<)pvHAE{%Hk<w#znA39H{xr z0Tc2}KIr=)fzDFXig+~2wnod+N=|M+ckzM!Lcz}~4$%g=;bzFp@MZZyw~X4DAb)Oj z&;tQhLP~U~XWCK9AAPD?a^twp@w2j?KHNavBKMoAp6fAYXq@7%Z%SZC<fi)Z1z?54 zGWUljROKFS9k`{tM;UKk9ZrZ{<$6AQwyB`?_iz@km<shvaHP(=1T&b89m7nw$ZWMA zj8}^Pu&~8ApdlY4k^FP@aNg2VD6yF(lYmN_A@GVldl?Q-<s@h{UL?ytoI5kMk6EnS zK&HjXjcZ;9tDHUjyGn?OB^ev*2$7OU=H#h~8Gc?|l9odmX7WdCYg4gAc;~#VoIHeu z@`R*SSV?6?#d|EtuFXffqAVAMK8lNt<>2D2+!>JC=?IU>tn;d^4JvARE1U)m$PziN zv|!XVH@qiu8i4WqWAFjllFD+S_Co{p8znKW1^L6i@bHwn7v?$h8ZTb8-OgjZK=P6% zczkyG6PP9SHNI4m`MB!(kBTyNz60Q!LC{E~DcA(PHM)3H*x~@+`70Em?=7fiC6H(N ztH_(Vw78b924?c62XZX>zf6k0N@i{V_IN*EaHFs0J?t9Opyb_8-@>FO@A3BA78Yeh zHdS1(bQxEve^}dczykn`(O6?N#DUiUHa}v{&t2bG3oCq58W1KGc+kA6#w8|bP}ucx z_y?2#VvUUsK0rXM;Yv;5pFn;V>S_D$_)BvyQ_0DyEf^aY@u8!8*|m@ng4ARte4k1W z#U}%KIo)F$Upx?#yg7rnG3*{FdDNN|8Qp2%TwIhqtXmQbVnv&<(c0gXRN;NM=(HRL zdCJpC_VEYTx_B|tPSBzQH|?V<)*`2d7iSro&wiy$pB7^c`58;DBO5SqI~tzMJY^<a zSEZ`o<C=x4$OzhAcY#{MpP{ah<ytcZ5GEj!1}$V`uT+KG@oDY_dGi~1j6bn?ot$vF z0G=NJpEAwvDOh)&)-ciE7@E(zV~$7CDre^TEt=j{r0aSO`f;Z|*VqqC=QGRuFG`qu z>kkOFALH+T%5jBODE@U%e=Jj1x>f~yZ&IF%xZ0kie0~)ES;JWJ%4F7+)efa^YmRG! 
zPn+yy_Uer@aD$`qFU{$CbmzF<i*L)DOYkM&6t-s=Ahf#-FFx2ff*cjf$yv(ohpM&p zK=_FEWA|G)O>bdpf{!kM^S`j0RsXf6iH~gHfY!D^<dk(6asBTSy31e>^@9VN$|M=y zy6!DzGfWOzLGrx3)86D(hYHUH8>hbZf8?~a8Yh<e`q7QV;Vb%C_6pvvAH{_21<RI{ z`q)alnD)Ot?9CTGZeh^5<8j2hRmUOrBdgGy;c_#lJooqV!us001Gi5@Bfg$0Z9yUf zgO6?K&K3@*?*8+1lb7=NtlnuaI)doK484~{{Ez{+mT*!5#VfcMU3eR6H}_tdiNX<m zi4KDL#5+2u%HT)XBV8`c=gve!(J<PW;6rd@j#JX@*`(F2XS;T7-OH+?mw$_IWOtoN zZ9K0qgJ9|dzTFhw-)N~t^UWEqMfHVxil11i=26ZL{!NbkN%GHQf1oD5*Uw@9|C_64 zj!cw+xw{~%%escY*671YV;vpoe~y7*0uC0H{2|#Yta5@+>`=}hl*PfhdjJ2?6?CyV zfV?$?mK#NQmqqp5jR1^Z>`N7AluslTW6>x#?oWH+t9fxyTng`e4tQSxIU@<Jz}&I4 zvR`b18T{|u;ermUKTGslDOokj-!^`@Ohg%LN%H02DxhuL0g+{c?GXZmE2PYS{1pxk zsy!~c=bGNz>vVrq3FnJoJAIKo-~BxC+oXSF=!MBBq`e-LLt^+>s9r@p{QRSRyRr~G zCAFu2gbynO(_K=K$qsMQQ%HPT^0x6pngERKJQ2mg`Sw^o@u|&hbAVoJ2<H__+9qqr z`A+&OZpDM?6=?<DhWtscL6$iNw!py1*u-*!=t=NjDW4BtZYH`mMll`mCr`wA#%Bgw z!{_s@O}qF{=BR=e!lz^cuq%HPuP+G|WCKu#QG+1c7e@cKV>ztVQ3>SpB9Jn?FIDd* z23roo!GUX~tB(Vr<)n)HDZct6=?x=UF}(Z#Mu)PwElP^v{Y@Zna3;Zy2#|8$SYG@) z7%kN3eB!G`>}J6M%9j&7rT8m>MGstlRN9L`eKAHncruU$IH)93stuCAsb=5Fv6z58 znSi<iqxVyX)tmgZ0zv=wF4`N0DeYuKMh93gFNM$w{y2s$KKSps^01yb@1FWhCC!(d zv;qNtAL=5Me*n^(SAmqQy4ML{H~%gp_Qf36b$h<Z@vr`u1Q`Ao#l33_v<)oFo0kB( zv0EA;57oNhkN@{6FFvEVk?EO|XDk1NHQk+?DhOL74XnTS=C9)5G|b&2`_Q6&o%8R~ zyzbf-F#oSN*tb0|X4(C;`1c@i27#7*=wz#v#kX1g$;pr{e&b&mYgzN_TiigYy8{2l zPJJ3!&$UQl@$#>vNu#w^qh37dbNF`yaabg>#Q)M*;NSg!;q19bHU~4N`nQ-LPXndC z{$w8W`FF!Ra06IP<=wwlYV4~eA%?x<{l7b2#Xue6z&wd0toQ!il+4Xf%TKHE{O>yY z>VVo~*Tzp9FZu6xWJV<r%J65%f4k9`XW8xe0(s*1Z(!m%-D0e`VT)l)m3N*0-8UvK zFW;(LZ2YHu1AN^$SaTRMmHyu*Eq<}wYXmp^JC|pI{O6nc{?EpZxoiBr-<CkU|67a4 ze|vcMPwW!k;}Sy>N$fL0=(ogsuKPF=^NxrkRDN;8;SU<Cx=rv;!@&afYIoc!!BW{T z(P7MFd!bwKR}l9?cLJZ)WL;UWi0kR`K^DzrtG~<pLR5cP#5d{G1k=>jpR98>O0pjw zHA*>Yyynpd)3hf1r?DznMLXW#9MN{|hKcL06_oX8R9mNNp)S}w`)5ea>z-#vzgj~P z`?G;rG!Zw^$*!31K15S{4SR8x_<sfV4C1si1W9q<9j%&l>4b@s8Db`CsI;sb!K@Z= z`ZC(T>MK=8rR_9Bh1mB&gR;o-2^bqsWaE8}9Xi?La49>*<1mrmPGCYvNj_QVAY1fg 
zNx}0FscBWeQ#*BnnL0a}I{EMgaWalL+3*vcm|1xK&)g%{z?dyW>~+%~$Mx&(>pI6R zI*G2Bnk1_?=JjCVXPb0qn|`9ljVFUTXFEEct9o=RZjpEYq3EUUf>f+PMSiDV1ENxw zjKASoT)#Wk$&NT)NU$;uRZ=h_{kMfYh_>^*=V|X*uJHc*Yi`p`bf+7u)X1?!jl$jI z?cOttBLaEn><Dq@x{%HNVTI!SSRV{v#Rz04@4B&C)~j7@O-}%Y<|>=ceD}c!VQSFH zM?e@>8L!H2$FQ6~A(a7yZe~`K41|hKa!k(MebDcWPRgE#0dWj({%V!0Fyj5EZS8gd z(W!LY6y4`8((U6AK*@Uq*Xs_p(fOfuVEaJBIR+C)1F>)|cZl{pMVFLt9&7`F%(Mhj zC0HmNw^Ovn`tv=2=hMR=;)_Ka9*4-34!RzJl>)?3ErNSIcHd4ldTu#y<p7;>HdO5K z(EI61L7B*INoeMq$4J&sf(_L|J3Bi`c0K$(#a4)uT^aWJ-&>vRXJE2y(plkk=UzO~ z{pWUtGC?Cco=|)&-hg}2<w$wEJDy9KF<IH!-4)CscRY`6&}h$t7*CU<Qv&RA<GGDU z&isK$zWL90#+>~UJxxE9s?azY@jQVjtu0;f+~)HHwS{z%t*NskY_l-%8Bts7Nh z|NJV!c7TGL*vU>6;)Im|8hE+`Gyq%|9{LZr```L^s<B}3d_=?XR%e#RfLyudpa8Yv zKk2@*ZGRZ!wkS?#>mlsCH5Y#W<<-h;v)lZ(b93#HJ;ii~eSW8>>t~a1T@7g#06aZe z>f-K&MZ3id?v<Zyd%6u7+u`6??DmO@s*diV)l1Go&q~_g>h&2kY=%x9Hm#rC_0)Eo zNu$Uw;uyd4peF*cR<oEM^AvG<8hqw7zeD9idS){#>aqJ<5xogOdX@cbi=DWfy}jq@ zuOMJL0*Hp+12Sj3R%eD^g*O!+-S9YB(Gf&QhG%ydA_#DBcCe74({OMLS%`yEj_eJ+ z0}+S!dd>)Z41OL{vz>x*5Q8HQ6A;G;cg&e-rLa^Yx5pmtW{T(OPL~}zrQW!KbhbsO zzQ7kocl5KB4ZfJx!^hqapkouLyC6avNat(~al-d;Eg%%w+)Lu6v~Pc-im(T{RSI(4 z3mlSPEe5#+=$iqMy~}%8_Mb6D47MW&b<l~?Ttg8UI{*RT3=S3&vNX7eI43A@yh!2- zlzm7qF5N*Azq;~QJJ}@k4VS%+Q>#4VMx(zR-A?Ae=dCu-+1Bm26b18*@23EcnZaAF z+&KJLg=i@!W90#xJ`Rq6{%U2?N9K`Gi*XY;eLF|B?a6e<4wvk6C=B8h8I`xnh9rJY z6(1+EU!!8SUVS2u{c)_^a3zaJ(?A)IULwyydP(b(dAd4?5Xb9X_b{m1q?Zd_{F!`H z9!F~-e5SjzV&^4T5O3;LGvnaUmhOeu%JI>^GA*Wig7SWlTKk<wYOfFK-{u?4XZ6#m zY)SQt;K|%5o-7|O$^=A~_wV1&(tPju`ftdCbMv3?_<wix%<pQER!TeqW<Cx!@yd7b zI5YP^hLKt2T!zeb?b0d^F-yK*h}M33pbyYmTVTU-e3zh$9Yn=7Ti?w{dT(>0NMQ*e zLJU6t0G#*!=bLeY{zSQ5l=l;cgUz{6{&PQ%+E8Nt?_r@!z7H;<C?=o5Upi3rIneM< zPK>_{XSioGj$sw^e^^fi#({T($ZNPjT@Oy-gom{~S~0Hy(*F)N8=UD9!xzEpz4QVg zUgAPss{1%o^v5xJz5C~Uu~&;%T68%Ry0ZZK5dR6fjjz}Z&^Mm<ci3Y(vuOTA!+jCz zR09-&Eucqa*eT1P#1xgN18RQRNP8U+a}DZ<4?9p0`e?CHeSMB{9GT>Ef&efP!!M)* zMI2IHV`C|^S0!pU#4hm_Z+~i;ZElt*n624}QI)bUCWHkY4?acnMYdOqKE^JQ)JRE5 
zDOBVzXQ~Wf+zD`!!YOYeiCLMRyLQ88ifXiqRP%>SE2J}LYo4dR4x=%SNitC1vY|H2 z1#@jPtvx=F1SMnk0&J@&6N74S+Y?Kjo{X9h-{^dJ-bN5e>OpUtPTU0*JlmSYUaSk{ zO{lQYY{;l@RH_AS9%5o*AEAw&SWqiLU6M#sW9d)`CQSo*CgEjz@8gWNPy*7)pXXMf zpqdAq`Bh*KNa##IbVg1!ue4ScD1mY7uApd@=~~E|V>~#nKA`=4g`+M`3;s*KcN-6n zj$TSiQnG8C!|6A8R>TDSg6hlhng-d3uU#`^e;g%^TU?5*3_JgLmHDO}lqyTbD*0nN zLGy+&8)W7c8mwJ+5}kUS-(|kBDV2J4%4GKq>_k3$D5g2sK)pt*r?fzA9u_fH&qR%R zdR(~~2zw{Nn-H~`D5O)PC4X`=3%O*@YX%RNig;Bkpq?vYNuKV_rJhU1@AJ};#~ULC zGt44cO%Ztur&tv8GYSdoOJ!9p*0z8XZ1A{>3%nwd_#UJasrMq2<ArsEf%=$hMRJR= zO)8x_-dvqY=9}=DFX$g!UVXC<ycuOYoA8QnvHJO_kD9BJ<Q6G2jq%&dE7fR4i+_L5 zait)+zk8+VtBWk(18PaWVj5JP<y|>PA2CNB31C~QUx;>gS?djcQCZdO7-E40ZrP+V zVo`1SYW+nP%?Hb8oKW*_IRBEy+|{!#eXgdaCSn6lQ2M29L;ZF$-SPpwo32{H#~+RL z7oFH*R#sN*(&_^O2(qRKb6G!b&u5BNC1iRy2#K2YFbbsf?(OXGBz5VkQq$iszJwYg zg)`TI(vu0py@7E=0oI}b&OrAv^8wC0n=^EM2o+BpqKPkx9$?CdfAq~};oI`cN`9@4 z9YNBK-tY_?RkoP%pL#ctaR%d++oJ?e3wum%(b26-l%3llDln`FcT1!$DpR%`H&2d* z=Pus3@8fi4%0SZ+?Y5E<rae+9tP6A*ml(QCm(7E4Q+Uu%I2X^upuT%z@UQwR8i9et zW+f0%>4e!!@&Bx}q)w`2G6T<A78Fc`@sc=QReBuYpOextov1+@db`&??A*d%J(r@A z)+7^nl^<qm{5er6Pl@m9fWx@-?0qU%i^KUcKi86J)wUP!$E(G?v)M?LsxlPiHXmi3 zS6V^(T0}BlIUd_bKo2xdRhH_sqEtB>(`>X1(@68-5(vn_FOx&^*vMUdd!BGUyO%+9 z5Aw-aPmIx|f}cL$Dv{nPi6uSq!^>lUCZPA%ZK(7~x8$d>N;tMh-6KC|Nke<RfJfHi zjd<;V_GI&&dk<@6EVOwOh>_?Xq6$65JxXvJQH4h|Es!fb(Z<?!0xj%zQ7E*HKhd0* z##Xdva!_Z*!!1^rl$d(F($0GRuw8lxCLZz5dfo-jECf#Ve?xAxY9XrRe{)mU&VjqU z5d?>OtE1ezMZ&JTw?#ebQiWyZ<km*ZUZMgfdItxUB`hrc3mgV8Tk#F4mYb6k?wzA9 zW28wnSu~_NV;V&k6Lz4NBV)gHP$y6pMF*<a)7|A<@X8HAxADr>*4FuP11{=OUT5Q; zGgYQ24tuvAv2q`==UI0^H)AV&LF|pafo6NI_|>un--)M@?K$c0NoAB<U+$&Z*E&_* zDLe1ZyNRriKznRz9CL~yr&mpNb?;S=5B2)n8SpCmX7RqlJQrR+aBFz|=#ra&P^gq( zA~>^d!zLZkWwjAWPLA95a$`g7()Qo-NHSbys~+O&Pd+JahY6B>z?kBjFr;yF+dsoE zc|HM9YLS;Kxpr43YrOwOR#w&>_uq=W*#N)lTy{(kwv&ZjL+%JK*NlFmU<S}egUWXG zdl`C1*yVT5VOBSfCI;RAlCJ*raD|FZOU)6Y_}U7hvUNuZKfGPv*UwLU@^$U{N@m#J zf=BT64msHP#S}`#S7=taxBP%9a+#8tpXm7$MM^5g#JSJZg!BN_bCo*)a%25K?nPVV 
z63qH3>q_CAQMzjW{G<x-Oxh9rGW)&<?h!4y`3^3i49$fGgcB@rf~o!f8wGXL$4?>1 zHPC*rNR6K~Ma|!ccm842qT64r@BzYNFLHl*G=}O~?2GsF&M{Ed&=&sn<vZXIj|TDR z!jG)u)lysf7h4&9JwH7!xT+?co-hJ^7vAT?1$Ra*F4$j+r|*XnhocOz*jeu`V>P<p zqG|asRs0BAHIui(t5uP)cMnOtbm!#c1h_YMR5w6YTEnuiqpSYMK#I4t@EBLt^~I{3 zHfO^fYo<0Ejxc8BPl{OwKR?5#x5}@eS+=dYz2K`+TO3n6VA=Izy1u4v$o^j7SZ%rR zlh3>`9Qq``9CCq~Jh^<qsHg%A=<wU$zgS4}NLFj$HY1@;RYxfc4>#c>IYNx-PIy?# zZ6iRXXgsl>HvhF}i#WiW=OX!GC%{96JQRxQ%pJ`cgGL*Gl%(e9d0m1OCE;bXpv*YS zbO-Ek)LqWhox=LA7=AAz?b6Hg#+MJ2MAvdR)HTz$J4|YvnSGpMi6wc9Y3L;h<gjS| z;L3Sa<A)KGa5;*x_~~S4b;fm$>KGZwIj~lxjk9P~Tep0oSQ}$;`j!4Nw8PsH{FK}t zPdW>9h!}qaQ@bugy<x)RNJPaJBrcD>lR7`4su|5ABvc@xvgfN&=w*lLLmgcg@p-!Z zg?Vjv)V=mrequ<N{K$KtPtl?HY*~>pF-Nwrm9D`ZQ!`=o+?&Wiug%b&R?vl}f$HnI zxoIUoH44fQ)Y%SH{M4drU6Q}+g#L9C0Uj?^tf*z_v}LDaM(hARWQ(zL$gx;}d|c4N z@yJYSOrbRzYxYu+D>pQzBSrg{b8E82$Ye&ibp$Lg7geO`wlM+d=WunKyHB_DQv_=u zCp#mZ*GoOrC%1b=*AK=3(BvHTUwGQvSm;sv$)A_MfmbEQp_b{w%Ni5CU&}ogi%{hP z_pb}nEYvjZPh24zNV=EMEkGsBEI#+WkF3hen)Nv1`aRwe)z~b75*bv$?(KIy{zgI> zyOlb|kLa$Q9_NR@7D-vx=S)IEaozJ@7-A4tj6FGXhD?)Pi6Tvg-G|r9^N<WzdVl0# z=)061ZAm215bB`=+WT0a1seeNJ@}+B)$KLS1J^0{y&?B@cF%*(jU%gY7H5-L6$~C~ zdo{l*_?;!Dymk=&Lb2y)7?pJqe@Q+SN(g8q@U)kmrR7|l2($0hkeiA!L9ff!yaILy z-{f+1^6XPs`>3;sos!<Bq1Nh9a)4|Y^u4}C@j90Jl;dKAGy<7N@|k3^#({0w<L1TX zt$1>ju?==rWgW9EffXtYLe30k$rm9jA%;5%$83a{y!(46UyiQ#zPVbs(e19Z(C~QJ z@**mjfP0pMft%|3K+<<%7Pf)Lc8wdeY~OE4$I}>I{u$CfZ@ZNq(XJnF7JG<`Pok>N zsXVyzy2p=~Lh6WY>1XTDnyaAgXH7H&vCk?Pu;Rt--OjY6W*{^-uCpWW%)_qd8JWG) z6EmD%E_m49I>xmfzUK}Cvi7C9pgH@ghYa^gnqonHdsA0k74&$1h;}BjeL7gOt(j<U z_TJV`9tjS5|2?MPfGe&?%M+ItG}%|K3?;=0vI}J+Z{?}2sxi%Fya(N)w>&B0f*UY_ zwVE&oFpl<5;{yICPcpQ7D=oMkutCDNf>P_5@1QJU7G?8pd27}RaCz!mh`8BQ@Ii<0 zLY@ym-U{Uh&o7b_ED3?GX2?8D<I0Q&l{!4ipNu}PyS*t_P@Y5eRpqV8);!(*8U1*1 zad8{t)WjX7LS~kjPTNu9-XRUzcUjR&H6!xD0vSqr7!F7-r69v>I^jnK=AqWLWeU}6 zM0LKLl@{5gA25Ug7*qdMHcF=#<K42fv2h}gynlY(G(=iFuR3U~q9YU2DqT%G&u)mW zs@1#f3=R&ahl((HtJOJeGKwC&+8NEweJs94mR3QQ2x7P$k}B3Ap=bjYCJH0>pJJ(B 
z8hk9qZStw#a83+qwo=XyU7%vU5%$=YA<{%3c2Sm66-$yU;5N}HR0YxSgj9=urIhIA z0k5K25beKiwuAkBd%Hz0y{fg_sLE}dT2pL-3%5WoRwiX>hTl9-aG6j({#~+kb$UmR zy9#t1Ai+zeTXkB*6THKPB8(lJniA#0eoBF3G{XIqo0|(tMAVaU1JbKs>MNlHzGa`@ z7OL94p#lv^-PDpi>H*S$gJm`O?1!|b&e=ir7Z~;(S2on7oTCIcwQ`j|i6=qQb-Bd# zy_`G(=4|;9n`hx{aI#!V(dQYT@Ohs_Dc!#^0E1jyEb;{jv6R)+z~rex%pNN@c3lta z2m5Ajm<m<twh$t_19*6nLGECqO&=D<CS_zRz&d0rotdbS;cIz>qE>Hvj*@#_yrnwJ z|EtApnY6{}ibIe6&8l*F`Cp6FLm*X}j)3ngU#?6O47otdUFwXQe2FpihDy28)wr(T zcZ|&NmT>gp3&UVNA3gAVC!Gyv8*D$|_Xw-jCT&6LItL>yTCQ0sMl=X6WN#17vVK(+ zW-JSQ+O>`G3o8ognS+gyMRTb}2G;X(Hl_|w`-@#N8WVAjsw&M_o7WANj~{mK&n%K7 z@?RDjVX@S}Gjv0iF8GqoU6Ol=ZFk(Y3852hFzUTyZ@scJqO*S0-6u<pCGVrAW-pt7 zS6ER437EiZ7-$ZVeAJ)>Q|emb&KPr?aDHeGk=YZ+sC?)xE=d^Q{d@Fwak?eO8CN*m zmd9prv$^t+AM$K#3U(u+U-E5I7M627s0g<w*WY=h#)Hj`pGkm;JpdZ!n$`VykZNY| z+c<WF*Kul1wrjpl@t9lygxhZh`9Y4V;WYEPpC??}*p<A{hX+^%(gZPGyrht>p|gDB z+7%i)y3iaZm6bV|0PTVb2m)#Iam~|e6BGFev9Sp;CSZ9`d!Xkn>Y^edL{K$5i<j#u z+0bdl+dpM!#V~@T!#040zry+gDtm&~oslEW?o}F)2Ru(k5YN3~?_MM!O=9ek2C#@| zQnMjhVrn|Nk<#y#SE~j|3Nd$;sQF(&ax>hi28GMExTi`~xXSrD(A$lS%98ovSsJ+6 zvPDj7$9W4{9^4$h{UrhwAlbJid$yxPgy4As<bd<#g@zcTH!$6b@aXCa+iXgPX)~C7 zo^DAFs+9+FrhkbZdhuTO(xBG2{tknT7gqppFCbC=4)ocLAy}cR_`C;~ZHGQrzk5-6 zF@>|?iHoeqf}jv?+%R+rHpb2h@^e*P$t3zKT8g+v(#*Gr8(5NVund%~XPqit50lQP z*L&0O8=K!e=mg=YEFZl;x-jfWvbXH6e67h&=h62V!qH0Fvw($*qysz*1Bvp1VVW#m z{dsW`Zwr1gFnTe^bYk9(kG1|d87;FAjFLWB8$K8n_67-xDQmffX3x&zWJCQ#>N}9R z3TIUU{*1u=aY0o{#+qxQDZOUG3owWgS94gPm@!qQ!;8!7%?B>TcDv(G57mgfPdZKp zaa%t3(`SHKyYvkY>QnQV+FKyU1(~)4$h6zeuoZ*Fjjr$}?w7JvX+%D^IS)H^PPvw& zFTF;mZo@!pp{KlA!LBML*+)9Lmhh%*mYg=-LJ>#W5YPCBFwBj2rri^%Dj>Dbd{bCV z|HVh7kZPgYPQbC?S9^QQcZqAJ!O2};^@kGR!4r2Ay7K~pjM$?8);+on%RpG1B;OgE zRX?Ab%-EUqjgTJrIDsVjdghegPOYr%)Sx4C!pznO*|l7N1bLNHKZ9c9gd*&<pz!J_ z+0L^I4zohqxyk}M*$Zw1V6TaR{Ld?p9G3rgbwbk!-E~3b<AuG_e9KqeI?Kl~P<xrp zgemTyJ+an~RL8Ra)lUo$r$J^`|7%weTT?Se#up|Gn?KB6xypKDyv#@e6<e@9_!bku zm~6aQVIg{ZPcG%i1L<zIR1@)0hcn5@+3&VJ#IdX3a5%hYauPmzXMo4iQV1eroFm^I 
z8cAK1e^r`6a12MB5kJyEm!#&FS5VsA0Aq6hM?oz+U6#0afNinX#K1>dbS>40je#aN zIUW>|sQ4=!m#!dojpI9m%HziwG%#_<n#(Mj$X)jP2%E+psvM)si4J!$i9L_!ZtS>d z3a3?`2{)T$xRzHV>%0QVpV5pSkr+Tw5j_*JQ(J{wdPjrV9ieNTRtKsML#%7=t+RZH zY{~ayp1Td=J<+%c6~G9;Jcz$!ShIY}xIS94a%{&Ieuml@O~8znbi&fZ9`q`wxGql8 zL7iuKuAz?7Rkhj9j>f1ed8$&X)k}1Jj-E4Kh$Kit^LiwEo;vS+#Avb18G0l;%@T~Z zfmqXRD>CKxL{=)HVDE1A0ZJLp>^-Q1>W`hOFPyR{t`(40rSi#rZKJC5*-qSfftB-d zu?(QtA}u)5)mY?ZWvkhX#Zvo5MyN;$0eflsdXN1csjRQM1v~AQpmvok1GkWn)*cF# zpJ`k?B*1i+pAa+PRb-9mE8fHm^Olc?vH!l0z!ze#WZ0rb)W7mLSv_Ftz0!twacLb{ zhMrGhPwXtf_{>>G$Vv@We7v|j?JS{KkiVuS)Wa#zYMIf@uG%BC-9^qQ-<E;~eY{Y1 zN8@_EE?ywJrLN$f;y49Xc40Ta-7lnRUw1ldirpIOt&*VjfzbrgQCq^pTb;)mIXo3D zy!&ffpJsD#TYWVHrzQ`+Y_IfcyNtcnJC}L!$?sbTBR<Q6<zSj&hf?7}O^+Ey9oL<z zEi`PGJOQyuk>0)JfjxxjBES4yrLB>QDs9&x;uMp`c^XLQq44<P<k+n6nFHoAB6&~# z@<1TM<)GiwW5W4(8MG!B(*i#@&TBtau>*O4=-Kf$w(5yi8uJ?UsxMS4NkbqIsofbM z?CEucjFWy19%}==EYZzo@?^zn0Q91V{h=LIpy8*~uz`n%H>7hSc5O|2peEIBs&=S8 z(snA^WvdSNcY?bNYSJoUTCn4xOmvPg`HN)U6)`ycdcu8Kj_U<AH2dbff=`~@fLese zu6Q$_MXTG2Z>EjWb0PZTS&_uO*~hoCt~|jlCM1tqu&oJ8sG+MmCx#PI(+IQEBUB}n z`XdDGiKYB5He@_F5GJ+dMx*@v{IavNQ8QNJ?(-~m*SFLpszWTS+_H8Y0-w9_(PxO` zRo`b~yereKlP5A6Ezu?CN>xZ40<=G7YKzsPY&ap~j>m#hr=J~P+nx%t$)DslsW+@7 zl>s*V_KfQ69kh3zj!J9-*uiSOQqQlw(+xH~IrH1}&{iSXdZ+`+1F`#7yd*xpZ$~Xg z+YXr*=*mVa=C(#Oo;X<C3je(_g_=mF^h5p#liz=77)Zq)9hh@P;%yIaV*<aVq@<W- z*6A|;oT|GSLi{y3oMWANO5kZm<guy91ygB>#k{-MXB*h39VWMZmgUlPh7>n~`9TH# zZUg>OA1+3NmpZJbsn)-gsaIFPQ3|cTKCByA?rv%&E$>HiHzV+3p)cl?ZatbH>GK50 zO4&@Y%Cp<oi`q=QA(DYKU94QEjv`3X=g2m*lEU&*HB>3dHMT_#+7;k|`-z_rAfhLt zPsA@))R5xi;OOh?6wcT7CY8JKz@(P2hK`l_chL6Wo1n1%cX(p(8jC5AxINSSc|_tI z!b>4$OdUt6yZFf<oG<6&L~e>hn>d8UP9XdEXTxfqX4{e%=GUG1JO-GU|NN3RV@BDY zJQixmJ|pSxHHs@~PDI6nsvp})bBnwbq$1@I;Ea8Ba#1~~_~=D-m@M;JfhH_AD{rD# z18)$hYF@$TtzzyhT_#Zlg?7$FZMRBxEd5#6%;jvx%ZPd2l`BPq%@fm*JNhGVFAqUk z>A~;hig-f)EX^IABCk&B_A9#hiq-lhhu8}Jg38@tv#ND!RD)4@E4)0aUf6o1X}Q?U zs-D-$mGSGvzT4O+C`=02we+<qMc0$JYdasI(S4UclL{QN>7)q{>^`)_ja5ON>7)AA 
z5P+iLI_KLvc|+x6*d?kxt&K<{y6R=$Uabkc#ui~Jx0Bk0gr243ToDK#tahUmbfOQi zu)61fbEyf6i?bLDN{INJVHi$Sn0{=5GTkA%RkrGT&@`D?yYO_3E_`F3<K8`PCA_(7 z^(PiLz0TP2NQO|7G$%}tvR1S~MIE0$`nPYAc6Vxslto*apNF<1jm7W$FR^TK|25ra z9K5jaR0tUU=SS56)1MRc+uk##aH0sgCx}{nVaT&zbmQX@MNiaoNs6>Yt8MGC9jR_v zy5otw3dac#BhB4P_Z7Z)*BN0!hl)&G|0p@LN=uD06-oP~nW{wzDAsN4R$dAh08A^; zigR3_1VjVU4_IQyUDZ{0zj%+eNQILEcVgVPZ|u(^<q!@!23f(ZjjO+9cWSb9js_Ks zmZpAs4gYB~j)ERmRzlc=ijixE5EK`YpS3Ub$}+lu8(mfM5M_E7rQ~|@Wl8m}#@_1B z%;;40k0+hTdOk}vP1M<(<EcsZG5@DM)Yr`gVj3&U&Z6%@_Agw8fk_XJmE2hwkP)@n zRN-@5h+DS1HnH>dc=YYI$JXk_B)3`)J!n5IAc;NU;?n)s5%Z6kHfKw!Q9KPh7~C3Y za>S?qG<BRjwg1{EVH)+r(?)HN>w2=`yDS3ZyrY!5U8Gq#-derV?&M>}O3h@9eX_t= zqRa<JW8&SZqo!IR$NZYJ*4p{2HX0H+!sCS4XJey_6m4H9uQleKC=Ypb1gv3Aqels% z?IgEB;#wesI7M|Q*AAIaaWiGxBkzf=jmBEDlg|X0EO}*NrG@iA6T_TR{CU0I&D43r zFw<}Sk@<<OQT*VifYxiVmKVa5H9k9>g*u+CW@dZt4j^f&+=gzg?zy#}h)%BUedhUj zRu+@ou;=GUK<~nK=7eHRx{CQhY&H~JVI;2GX4-13IJPUCF6zAdK1Jy0+n8qYjwY4; zQL-W;+oeFp69`V{3}Z;&4AgxPHPAj?1fStv{xMdT)EcJp?VRpU$`00)Q}6M0_wae| zMB$r;mm({l+X(0j=3|JVik_@36A%#{zE19<7Sn3gtn!Ny^0fEhHl2{6!PQB+$$9FM z9j#LvM0UB$lbTHQk;f+8aE;@N(BfX+TUOPk3a(6)d3wl-mg`D|sp;;{y5(-rPc}@= zxAn(`{<4-!2Mr?=<sJ2!=;sKCy=>r+b)l>YUY8-Zclm|J`!?w<hEADs?5t#k@U!;m z`PUb}b2)Y_tSY(c1K5D`Z6}J#i!9c-<8fp{r2xnhX*WH&%VL*p-Xl`+a(L3E(3QLQ z)zsX_<Z(tKr7_%Y`g1;1pM}sxLF%}gml3;Q?N|sOoabjwM7y>-1{nJgN;GqSjSP=- z-A4ndIZ+~OwjcNAP6$oOGz5-PFyY*kIA{CZ-JEzo1{%U?QQuM3?+T~tY?E7HnBU#S zV%<<XqxGaMSN3K<!Pm8*M_m4<I^S|X=zrc)&;_*v;0kWRDZ@Q)>~dAxx|^O3lvsd3 z_yP5s?U+*4$7vkuxmgSBa@{(hM6u{?CR)FcOj|qnaxfFB;z1;6yp%BILc&!yUUJg2 z-CHwfmp|OWw*zLK_%W0Y4F3Mkb$b&?)YNwpjl+d1{Ew8A?cmSF%<Odwpm)tLWY?TM z^ehq}5%2lO@E;(2N$^fgxV9*gd{NPdbiB&r7eLUtV<xKbjR~$UH#=ZEz1@{oC%(7P z)O0J$BQ$+ZBoY*2tMHT4V)u6m+xc1&-|ad%w!2l<vW57@?T=1O6G!CqpeF*9&p5E~ zEugQDgx7~oxcdz0fzr!Y(DxtFIYT5^UWWHQgByV|!@B#NC8!uD97KI5pa;$$lUi(P zk4}i(u>l~mLWKrJ<?TJXN&C)1KJH<j$H&q$-lQr=kN5f@Rq*ZZ_Cu_+Wl>MroeEYg zYO5wHT{|Nt13!#kd}0cC17@J1US=4)<ooPg{RIr~qVAt@1BAaRwRap7>hTlCe6`KZ 
zv}TC`CMptuVsd({0tS{tT`xXj;^uN8^4|t!ZQ9eA>=&JS(Dr=+hW~kk!pU!H>iWyA z&W(CdA5s|?_GNvs)<LJT4$#I2xTO!L_yf6L64W}`xlxzC#d_J(Kd@NLPcZ6oT+1H8 zrw1Taz9-+l$ojM=QayRp9(r+^FqJ@!N^HQl0q*?64TPJ7^jL>gkrL6bX`RSeR)@K{ z!gN9A2JWSE(t&??DsyxkfR(o(UMiYS?3om#2tyt`G#ffm@Ia?H<k9R0JE@e_seWCQ zU~~r#@S`0ZAlox}bWDM;k~`*o?HyIV4Vuk7LubzG`9E;V&UpvVg60~wKxTnFx@Hy% z;*1(TW&mP0<Ss<a0}QYS@P%joc0GwOHl>H%9hnuk4F)i9mfN2ASoz7vnpK4P^0gzM z>|#&HZGcs{G|Fq)7eIALW~$s|;aIZHR9*YF6R)Husd)8pNz{jzq!;ne33YuB)y62o z0P5`nETnz?hB5t#0`7LVJAjy6tC={TmUr#=(GC_pz)Au9-umwbs2!4NhgIle!SuuX zWPX{@_gUxEwPqQl*-H+3+UdobV{9~8#9-QSY-m?@<)|Y2xb&+-u(E}~s-74;3kt%6 z5kzfGfEWkj@lODw;O`d-Zj!KD2=M8P34AfZ!V|3R|KtWtxlf&q!<ff<CmkQvE=T_% zk^};8gw7cPp6N*m`fRHov9L_2SNeultHuK~DzQqgvM0KfY>vfrVZVkk*X#=p^VDlK z>3j&5)zGs8$=BjOq+%0gdLsa8Y)|=Pu<`KmaAH012t%k)8a*h@mh2Ayp1vIfwL#sR zTC8%E>jia|bj2)ovwzDI9JU8Jb=C2DLc`Y7Y3N_ucLvGi<*NvbKyV^<<i)xLX0x{i z9n7&L9Y?%wGOrLUOnwn(SX<W5Jgk-rBI^MEBRme6k%h+Gu2ZS7g&!cRohv?Iosu@P z{N1%wt_ug=Y!KzVXAglfFsoeXE}EmRZ%-{IUTZCr=&_4<+ACpx21~lON}N*vDkOzM z>?hdhaN^LXrCk-n4|^;(SH_<9F3OR(TNsNbKeSEmPZ0b;s8r_Qw9wXX6nb*Jl)g0r zt(rpE>==V0+wj+Hk3(5h4974}0KhL0({LgSy+HB+j-|_Mh#U0iC{(con<mhzI~$*# z$F!nk9!b%^bn$&vo5rnOH*)e;z%Ko$5#9pR5Iwf~7!zu#%@+oa#sIbr2V8|^E5A!y zvb7W2t(RjGYGme)C6nRgC8x~MQ?@JJxPmdA2-9oesDxIQmMM`YGOx@hwp4z=c^mEK zRrii<8hLavWc-N5ZdyGb>J^vC?{=tbmRiCZdp?CZnzGrf4@*FlOT7{=aq;|<rHM-) zK`}5|nbd74ZmD(tn)fnv)O@&dO)6moH%aIbx_-Ig#Xc)8Oxw}z$UZo{dqvV;QH*A^ z&e8`WHS!6B^@&WCVPcl+RJ6QRe7V4dyQivJW3wzYx+^>S?CcaVb%)-b&<H@CXb_-O zNbP4rHR4sPCS+wckF(<J7<Fq;bp&0HVlJ<a){B{OR9x+=XDd_+XY~Gh<w#RU#lazO zOaU9BO@x}6XrOL!vbJmoPEK&D#@a(Y?T%aE<4%N1r<BhMAHmpdn5kOBfYInxJYyBZ zz63x4(XwfwfW!h`l+<e}@X8W$>G#Hlbu24j<cOp|=s=qRbd74BzhS&4?Q01d*;;H> zW->TgSFc^mMlNl5&fw_2=uW+jn`w>Np~Mzair_Em1j)4yO5BspxSnWeQ@wbAHe^M} zb}xYpLx3L$?D2N)e0TRdJXrpkCSs;%soc20vL-Hi_OiE*X(CmNoAsDAYW)}~&zP;$ zd8M%NbYbD^$MTikBNTs<DuZg3BoNC5GG?BhKp0t-QC_ce2%0!$!sNauUG>U4@C-94 z2>cQfnn=8uaOsmQ#sM=`S2s;HUp<<luB0mn)Xxf4nWLmDGz#D3=Fnsymzw8cN&Ju0 
za~`FCo9<&hjxOx$xDwc+J1ehs4Bn*P*2(5AQ)7v@R35Q)sofEN0&8IDjF`IFe!_g& zkP+A$NSV<G%3!}93eJWq^R2s?3+z$cjVc{`SD`bCGqW`caVyRZ{msAd2G_8FW0d?T z<qh*Rp(bK=lHn~YBo5h53<M`*Wcis{{^_R9yOo$%i#V^kQFAO-^2!j@S%!uUrQs)d za;GTfstvKWflBc9_Zy`L7UZud80`c`uez=eR-|a(r%JN+?Y0EEP6l7dsTnUCUq1u+ z62Bsfdk8(qRl=b(2%z&lcQcPuJU>qlk-Gar-27!kC;02}k0#Z%?1o(*3;OKxNd6?Z z;=GeV5T@8}5;C4@g5dB0tW1@Ol<#ivCs!$>aR)%&(&qxLe9Vtg^GYo?Ga5m@oDlHk z1-pDtxRQi&z}EptXMzUaShWGUKK>)}Z-A=?cw5+)?Xz#?)a%>lvGjgUV)1T6Z@WcS zCe>vmO<Kw`qdQ9r-sFOJR3r8wgkADnO51_;-lSL%12xLHFS(gUdB9pmv_)@nSV28H zs|IN)5eLHdK6r9K^rgh56^(LoMwT$L`i9P!>MH6U)byS$y?4cNDM0rdl^*JyTR`Gk zkEbG!I?xxjiMkAJpI@OigdQ#Nfa=AWlVh9uc*Fj+^`L-VniVE~^2y6a&P;pJ!*4yU z^RZ5@UbM~(1huBcsOtHAa6ezV)_A6=Xa&$qgA18IG{^UXd=pLA;uBQc={{t~*)h}+ zmFauW@&6ol7K}1SIN)2FPN?z~)>flGU_C8J1%__b{mG{dKChUc)9K>tgrBdxsnF<# z(YUpH%i~|q3I|<>8u#vbTaR8sdND~4o43nf26a=wi|?<$-Sp@TON=a{4c~UEoivV) zoGG@p&)RnSAi`i{IKI2TU#)6A)?8&-+Y@vAt#Xqbwh(6oUJ(&H6|$P7_H^Ta<$2+` zW28+&&`(eT5J{}1pDQJ!eY-CBr2gz{*Us!P@3XR*;n^xjk;UUfkmT_GjOFMNMU%xf zw%>xgM^R@FcimdmNyb<_>B44d68;y9F`ipv3dN&9J~Mlsy&+Kf?OQJIM%4b#a-9+@ zmYJ!4h0$y=`EZF_{=HhL{ZeL1b#?!xWf1ya7dr+#(hq!*2A0^$!trTJuE^eMYD(u} z1IRzkEH!u%ASO<|h@{epAFe&XQ7r*N@A0CjC`QNw5chRp{V!-O)FDpTlHA^R7Ur3W zgdF(lf_r<!L(^UfKqR&}x-;HFU{D^xDGC}qxV;_TG9<U;UjOck!lU@co=Uz>d9A|; zFj^9u=J&T{+`LHvnN0++2m)Ku1aC=7Ew@a}7L7SI%=tY^l6M>xjrP(^eCegp$zB$% zBYF8Y>19T_$T-MKJETpk6ulM77yov$`rR7K)rHyiS4e%eVHZU;l#@Ih=s}of9(zqS z`a;L>m$5RV-j|@3czXIZs?NV=pTse0uL>~d=92VL?frl;l!z=O4qmyTghwHwYW&L| zQlV;TY}{7=-auWRnDv)-Le{B!({|Immi&QRa%y-X`09+UU!xtZ8TIJS15!@R&}Wdl z;6aK~3Y4)K%oRvccJ24{+V2Sk$_9>DE^8kkNqSfgjnR&;Kf(4b|5PY;HoC5ZC@2Up zuZ|b{(^)Zd-~}UWeWJehR6l9aE7lh#e~G*_>}8U0eQ_~+1h2go<c_iz_}YsDafZlE z!F74j&z%PMd>MXLu00UJ<;j!JnM=I!1h8Kim59n%DzYE-#B9&Z&fVfPfiQx0wT15I z*C0T-m%Z_)ncCAe`2GYL%zBg9+s0`DpMX}mHIxpq-f%Qlnhz+wAh;-CtrZKLeL68z z!)~zLI0}m50`I(ysFN~O$v77)2;>XXHXe6I`1<<JExqkC(|jWQ^-0?HLx}DSnDgxp zpbkOQgt6>Vz#t-^$F^}FftLpK9dRv2#;blu2kWe^X=2kmF+g~f#xKc;SsL01VN7;P 
zgVqvq&)*ifA3~pGr*o^pjl~I5>Al?XaV!0f`G#}M!H45G|5SR@wRamhUEs}Xm@UA* z$Y6hr8$Vr%o5<A2Nb4Ld>4_LyAO1OE*a^_W2V^BOsyx-Pe4N5(j&Q)o0hxlXes8jH z;Wbzk_VqjP%3kFvVPPQ38e$*BSQS#{Cvc>_;S9Mi#2EYRCKBnX?aci}OYw~o{f<&V zNC3tymyUA>itP(F>(G!3G|R1u2{1amD_B|Wm%h^61)JeXGU>@n>)H4J?Z&V3mE{_? zm-+yo0O}YBBA3251dn|*{J<;WO4)=z11!c{bQMug6S{dnQ=iiY^VxV!#~lEr)z$1b zrYiNL^nrC%pIzDhL|lHH7|Na_co1Sb5>O{KXP*Y;@L+rxCC#63kaQiP^h-+~P+dIn zmy7L1b?jxT#)^mpYJ$^5be2gTpHm!X*Y`Jn>FMdQGQa>0*>!i7+I0j_5dask2{Ex< zR2a4UjJ%kC=~mutMdWRs-n@h@sY(mM$SM984}nEvX=!Ze?SpfM^aX7Z5fMyX@pcyA zz7mZ9!qC1}ZtBN~PKF%3*~j2TdRzX&#*zW)eDKmnddOO$U7j}MmPXWB{=)qzY4FVg z8nqD>X6^j<#{xj#9<VPk1TDa~xg2uEb%7`;{Uu786#2e0OYrXoQaRV|0vVFp{iRQ% z&q>s~gKcg8pw>Yqbbw`p6l>SREosWD#qsI4BLe~gyf!Wo2PjHqtAq^J3#+nU*wG-c zGlQ~F&eM9NzJ9$fZ&#DJt$f~C#P}BiA~Z{{T@?n97h9Wjisf#9Hc$tg<fqw@34plV z%AKLaqe1$Flk>Vs!`<Ewf;d~9lB}J8$Xr~k;6A+lWrC4ypGA<_sd9d$xPQOBa&3_; z-*Rn^_tT-<3m1?7!QPvPQ@ywS-&9IuC`yuwiZY~<nG`aUDKeMLV_4>~43U%}vqfey zWo98XA@dx|GS4!U%+v4uF73UqYhTxWKlkr`p5r<0Kkn~wwD(GDE#KiYoS)D6e!os6 zcU#DKE`f9u9>6NjA%1CgPfqA_3`;NMj50U>zA>At3%;t!GlH3C$K*m^DD<z>M^hyY z=a`O5)|r-0tX)JYU5NnQbaO7`jD_aS9UbrE#Pcl#;#+7SpO?{0LHCQQC1#OH9JkR$ zSBwtDV9wJ;?W|R{=4V~)c}!)LJY9B*4y`6nr%Q|TtFxRL<DRKrS8{oEG?$)JJp&*Y zeCip|mXg<BSYV|b5UuoYl`;rgzQJD09|PD~*Ubdv50cL1IdJgT>guZ($dPqF@5A<Y zxGHh^l$)6sQAq0HFq%?td8P|q+Q&JM_%m;vx_a|wj?hT|`+8y@?;LGC>h=ttlr|#G z>(ZgZoA(}=@a$?lFt3yepa8qS!Hf|iBMMzzkCJ6l9j#-O6ON&pMBUP1aYq}ftKN!+ z6kOI^Z7Am1P)^X<pj3{uZ|)}c%FHE-hngccxEe(2={fDO&21I}XG{DQMQCxwwy#fK zyn&a<hT)Ub6a#tN0-I^D9|_|$g}vY}FM5h7OQH~ofPun?+3{Or%;<O!vhj0qjh9i) zt%#LNkiIt)6@|05V-kDZB04F>I-}U*vv`3nH!wE4%ha0g#xte^T2X{1FIB6@)vl(W zxJ7i}VzEU7VZv=Px$FG21)pG<hHR`q|8!EaLz@hrDQ92=B_%9Ev<FHWQKGFr;lFeW z5gBFntq&l{%~)#9u(OR5A%AO^YX;q_G}JVp)Ek9XH&Jnmz+_&0ds!!vAXqSmeI}0P z{63{VPpibML|PiXse@m6^D+0?Pn+EltP(2E3dLSZTuO249x3RzbwJiou;KK|T*v75 zR@CZE8y+cXNl*d7fZ3}Ji=``_*sAZF6bpTbjtNnxTdam-?X4KnL`4f*=aSBDrx!nQ zIfl6!Q2E7{C3~JSB1oP}Q7MUBE8FFs$+AGrEJ-cr@~v{1rmBc9KQOkh+joU)-wIY% 
z)zNNMjRAcGdIZ7T2#-D2H;@Mou7sbffJsl$I8Q#g&8feQsPH%o1g!#@5TX%=(!H&6 z)_F2dj{z&}CnKVemBi3TjLik2$gn!~LU-q9L=BxnB6SAbm>(6w-@maW7N}EYD!rX# zZ08_erJMjIaTMdaq8iN)!9qYN)$2d4H7_>c@)GAT%%*K46P-_$w@msL$AO%a|IkY; zNPyd}(OA&gkqe-7v_vrEen#Yek<j<%Pb;KK+C@+uP3v|VUvkR!Y{(cOIaAQPoJreS zFMjnM4RS(ypDd(q^q}_ZSr{Ap?Zc!OMN*3i-E6~0;Z24(wWp_J)H76`Qh>RiH@NlJ zuR=_oMrDd6h`7)pIrd~9&41{<ywN%x4d=ugk3T1=LcPz=cXr&NO$4c{NIjc>pilM{ z6u+5Wf~?u?pb;*L&?N+Z8muXsPs&y`jVUxWQ+)isb>T(9yHKp36lbVjz7iTjloG~k z_yB+`p`0=}Dr5goVdT;B3)+q$0ZI0){PiXt2fQo&rCl-mBDaeuW|31zs=(rs7IQ|Z z4_7o7>iYd3GQl8GzJ;19$$!FqIC|31B5Sdo|7}8+ve0cs8Pr|}46dL<q-L<;^V?xm z6E+KR60_7{|9KLte5J5GDHUhOA;d{SUor@Wt0@tXLEs7vS;4Y9%tx^DM@;16Fp1q? z0uNU%>rCs2<_#9h)kU0`5SQJvwTvF$;ITp#ma<6&kQh-o`n%Nc!xrO0-<0u~Etnph z!FFx?WiTf$Wl~-!fi7@+k`#Hytr+++`pDZ-E&*4MYtC@@uZq{^9{C#TtduA(ublCE za@2;rXmQpGBccAaLQ3;nXQjG}8mB@z>)J+-{O#nkEmvk?^jusYVa?%OxoP77>A3ST zJy9@tcIpV<ciA4gg{cKm{u8)T*n21kvumplw|T|75r@$B_OrdX@NS&j5Od4TcX-A0 zZ9O)fif`LPFcTL^?B#kuXLrn|M7IAd(yGx3xWyj&xfC5&>o`B5W((2`r3_x-Lo{T3 z%qg6qendwCx*3y)Y9dVuBaD0^`eoe|Qq?SuFv0{^usg(|x=fv(#>9y0thb<EUT?o? 
zti6a=cUAvrd*$A_J%n_?b?AU6eNZ!KTd~kLW?Wn~Nt2ztZ}#)up<5@SjmtE$_eJ8A zYv%$K_wTb4_gHn0jo<$?49FnTi1d8XbpOKxy3dC{(ih2IM$41-bX`K+KCiqf7-#MC zd5dZYHQ!3iMc^8n<xhyz$<H@>TUrR}){Lmj5X0kmV{&%APNJXU=EhXgydHn;o;N|+ zp(RlM$E!tTxJ`|`lU;Q9N~-NwSwyC@IdK2DtFvliS?KFaOgl^Z_?0t6CayQnwJL@x zR4t8+y(Kh2!5Yth9hz#uQKiV|GMCpF+v1iR#0r}}<bI>f5A7OZt1_1Dep4U6Ju=<f zTQA~g&dn+S=pTo3X=Xj^$k=ifQEay^{eZyq!@@<vyBYnfOh6OTIGPkB-=w9%S6<T_ z>fFKVy;$B+Y@umL3EI_ylW8Ak4w{;_2DMCfNyqEuUE-v+7<+ItzGU$D?a*D>?i*6$ zHoY7uwP&o(C-~l;-SiAiJU&u%h5^VO%?`?ukHv%}=Jl?$G<{HW^!cuzfI{oum_R|i zvPk;b(1@8P;LdWU)ZEg8Y}ZeHHtz-PY@8*K7ES*cbW|XwnCt-&;y`6nTatuO!}!Yg zUfW+&ZC#maJ#I9smHvagpI>$zv<rj>o!HBurlGItke(HO4=_^fZ9#XW5_Lq<v@PZW zF{gwsu^R`4-L$!p*r+T>aRDvw;vU{vLe3|ug=7Etm{G-HDk<)@c!Q6|-CsBwe}!hn zNDY~UBVUdT=-{&IC%Q*bZv;-+ei5S~@$o(Qq`kh)f5g>a`L6mAni7P&^k#Zt!9jTH z#RGJ7GzJzOor*b-TK}bWQ)B|wT-pPd5!LGC>l_z*p{97wGpb1cuwtE?$S3w>Qa-}a zw?KDVX1hLk_+=vmjH=E|Y29Fg$WW1<e+O2O3#ggL!Wn@d2crcpRZV^6Fnw*J1~9Vx zBkc5{L=JH-<Yg)0UI#cB)mg&Ep1f`tS7Et0xeK2+H9vK_Ta3EA?qM>-v)q><wzi3; z*D9j6;gl0ao8to!4ym+`ncMXX-LOj#+Xqd#+nGzBnRh*3^vr{`Rf}y`A{jBIUM`Ra zBc2@AV_sl1L7Y~kN28+n&q1;UmVXd6;jNP-k(0C-oWr0SMB^ZgnS!{fP%$S{EqA41 zdMtB{)IemCN2D9gT}Nl{)Bt8H*0AS;7Cw!d{G1sMG?I9s60eVzxuW+;qZxBrw$kck zylN>u!HOxA7Gb4H;l^n+Xc@30w|k2LXE>MqZ2nPjUi2X0!K!j+t>F^XK#vsFhfdKh zp`f9jm#<zmgb6)^#hkpz`&_0lX)u~hPn)RMr=!7N2r8(aR!alyjLw<s-6p>ZdX1S$ zdzR1c_!(L-KVvI6uG1Tv=F6Wwy)<6NI=R&KwQ8SHoYMqAPsUo7CeS}>BQgwgAK{%! 
zcCgP3+qj4u2XcKR3@~=CB15I-F;D!Y0QmtOV$e6dnJ)f>`<rIz*h6p~E_~ZTg9v!g z!!ZJR%TH3_^}fuiFyXX<_-|W*xhE~;_i2G#OvDc&74>$PV0hC9mz}wOWp^RS(<IY) zEdNZ}-6~yLgZ&J*wmZ1)NBJ{N+iP(51!d;EDfbo^#*3C<V2q6i(!gtuttU^Oc+#DA zQXYLNJ%&rnvNN79zGxZFrk=PWOYPLxU%Zfa2lf*lSl_*;i8&9LP>m<Yrm5}N&%p8_ z+<@q`;^WbeRW3<$=7k+IXCxQRx=K1kzWLUDG{;=D!=8AjPNgP7rDjOgQ+2~kfNv`^ z8td^cwIw2k^J(05gjjh_9UL@n(6^Bx$bUvP>;8}6SFWFO$>oXL1_yWH^%&x|ALsCx zu8>y!#(w|Z8;xvT?JHJ~pUxh{aD>fPe7o0vI7OP(*FtopuEF2zvJ%Q>q;S?O9P}4c zmMZNG{3`7UcM8fva`yLwGy2pq4*K-ufSy=X2xni}=VeAhzIko52c?AsGxJN%^Jk36 zkkMuYS)LoHHy?{n#Xi*!Wo<r?o9KC>t=|89EynOV?Fl=imwT@HJ(2_fT$uFrw4_LK zIe@z6T>toBZK}22Ul|r*gTkQFm%|ST1dB%3$fiG)^8}6CM1?uG9Vq*CWLPnIYV1OY z0V<XY`IUY#Yrd#(gZldE_URjZ^dCUS8q;*_%$?`*@|Vik`1j6B@6vk)w1N#k-}G@x zaenlfGiP$TOg~Y!C4`3hc_UCgf)TcE<Q*7uZW$r^59j!12>AJ|C8zZCpnXE>>Jio+ zSpp^F)6ZZYMgG$IH|J}M@MK6!wuTuJuYew+DU3$P7H@rhgGJIw_yYp@b7EVf{0!a( z>M2Sl5C>Z460~08<5s84#_Ospnz(SD?_fc#t2?;12I$x8xDy1DFMj;^0X)5tvdELY zg_ioE)8ZFuL5hAt<Oo>UV0#Im3kTbU^2<Gov7jRvxcU6XQGaOZM@87CE9)T0us}95 z44c5jUoCKqs@>l~W=6JTA3=zBZqt2+TPFC?2<Qzoi5=LgMif;m+HaB)4M|$q4;cvd z{KG_j%i#~G!M}qdNh!&6$*(UbXZWHLLYePQH+9GhG^xlKGVCd}34$)Ap-(j&Z7*tx z-}%M7+~c$~5oKD>_8LrT%Wy8$20;A)_(Sp1Yz3!0&$mc1$Jx2iqP}|g8Vh+A0}T7& z#k7)vV<N5E;58dYXLXG^^i11MMqA4CS1tqJq-uMTZjf^=E_ZUS<ZhgxlTC9hUm9Bc znixGmP$sD#G{<(D*k10+7|j;0WcTKsE9A@Ib8R7I__Q)tqKg?*npAW7>OLN(=1rNX zD&X|9<#H|HM3lA~PNu77)@vCz$2z<(ZN9nr9l~uRuaD0tCT;jQ0|TOX23?h#S`+I! 
z)~c8YJef?w7Og7QnXA|n_?>>TcgRT|+w-Tdb(?m^{JPJJT;YZUTc_Pm{kgFiS2v%Z z-I`h&3y$OCYMr`+n{lgguGpk#`OV!-;vrY-%1NOG#j_sk;}L_7qhwU<(Sd8E6i9O= zkPJbH9ox(Owk=#hKS;7m?e6lUMzj<7e{|bi$*78R7;tJfmMheQJAqI@HtK+gqO2Sk z<%95ljN1`<3ZPLOeope)PBg{~51PBc(rX!@C2vJz2X=Q>>CBH{8i76HVsv_h)+n~7 z@t1LDuKRK(jFHP;XSd#L*}*r<gtIvTgT$K>ZgXluFMwE@V^L&$AnfcaT|$P_VILaM ztjc=HJ%6>k)w27-d5nV;ADa+(datjyc<jvT?)I%OBj^>-cajyS37iHKC56YD*U~IN zOB{CAC+W;v%tiSw?Ooi-v={Ol2sowBEH}zj*1>VEsHYP)@2yvQjbcn2zwhZ$2ndc% zz&Q@78=+Hf{sfvxL9AH^LWY5tnXh62)S_|hdHNp^moatNV%eCDfolLRMU1H|p#MIQ z&Y;{sbX86f{Lz91Nl&oxuJ+q@N$7ZNFAr`_@TB&IYE`W_S;Zb7PPT0kasXVHZ#1XN zA73j5M4aLNGUNkJ<Hv0<zVMMMCun4y5);j68Cvvj@*Baq-~nLbXg{tY>hsF2`1utg z-KhAS9F1)Itu<?QfMfy=hiJiNaP!3tlkhz~GX7@3Qv`6$Vu0?Jh|A&&&gQk&XE&4C zuMpOn0fGju)NBdVi#tH~Sb-b%E|}>LDAENihx?BJ;OW$CufewtRILS6wWj98jO)1m z_`JLJYIhCZKjc1>yStGq44@Xf_fkQB;Ce=!x;v!)BLGi<!A36h{vAAg2ReGUrWLfX zs?Al8U5Bc3ey8|<Q*r_O^<Sg@OdYG*-ClTgR(NM5fCGkIJp|1IQK5c{AMYPr3GH!& zz4q#?2ebqj4!aTb3?j<B|E?M(`WKtaN5>8F74RgAfLSye$OGYqIAD-gYhd9;D*p!y z3eyXb^3GS*Ts%Js{)03M1gEsUf=#9CVn{>V_f<G8A^a2=!s;F80h;4EEnY+SyYmq| z_-3wdb8_$T__6<PI|S8uu@Cao|52~rfpHr~lecgwmB3DE&IOJVrsm6q5B2aAVHE8% zNkbq#7E+!7XObbl?cf8BC$-G30ryP{hEM%vTf)Qq(5o(QN)rc5r%dHx##Xmr+?;4k zIGOkoK9Z@Ld6)t5AcyhK6?Vv$T$*2tqqtj%fp@MMU8^Vo{0W|e^Ygk{t_rmP5kccY z`0()AqsW~qPyiIqmd_r<jo|7ZBsF-&Pzz%%JO~pTXKDs9`10Atbwlia0^{7l-C^O~ zwOZj8A<R$~(L~=<%cb(coo3q0_8Zf-gG=vMBP}Q}Ul1(Rv)#43lRc}@yo_S4eQQ?U z!~oFhL)2R?#zugs7`H?A&I!nH>r24E+WIkWlGH><P}uvo=(E)zTz9R$M`t^Gix0j1 zX~N3&v$1Wf5XADmg@&0+vtAT8k#<gJPJw}Zm<*7rT6U?gGw+Sl1g$=QK*Rpdy`KvD zf0I8diHeOqjXxl_M2xiTTeY%?>wVce=H$iK_0nN=mj#iDi3YK|KiK_62zxxpc8|9= z`BC^R95-+z#0;ObIv}olP<-;8Gf*p7v(6w$7!ZVbQ|>pxuu{V}ATh`6oThWb7mk$` zb}nQo19tC=z|?^t8$`K*FJDGO5P?61XD@<CZy+xFJxBBns-x3$y4^^`!P<(>*=)a; z8sj;C<0^6hgv0R=B09z&AOEAf^!I<gkNb`NIa`vcc0Y@a_A988M2Io02%IXv{QfZ- z3m&*$!;LqTdLO^UK#+->I5g-kNo-x89{~HS`WW^pZdiGV_yq?;9*z*<52C()vk;Id znRD&W!S8}wV)-x@X}^u#`YF;542-Z@g)Bg}JAlRmo+hwGZq&Sd1`(FFv5k!l3OPC< 
z@H>G2x<=1C{>uB)iOqhm0jK9Q`(Q`)=X(QWb5)FxK9Li-{=>f!H`nu>=#z*)V1H|j z@f)#mPj29RB!8`NY`O~{rpGgKe&4!Vk~C-kU|0QfkS59l#AV>lhe<>B5r5xqaYlxc zAI4{{9m}-1ag_L<dWJV*q%p96L&6NEDUL!5fyT_j64G@Y*yJhgwHp+2rTA71xf0xV z5V)v%v&t{pj4p0298u?wkWU*8ZS|=t?6bS~W@)T8E#dBI<SRejUX;tUz#`rr6UD=Q zX5Buhf5F+jmcF=b-H<;v8YL_4BE=MHt&<{jaj<Ut*rX5eY?Ly_Y|4fV8yq;ON#?cd zg$`6{g-Ge*i8kx5114T7NX-UF_(jC?udf$cV0*8tZftBo4pQWdqmitvEa-|E^|8f` zKBMdYx0Jwc1;{4+1B695(g_%^1M^`Aw!HD+7hpnxo?4PS>e(6*^;^%z1HsXD3~13W zI(=SHe!)_Q8`2Av$to;F6<?99+c77%2<eNE@o6CGozS|X1}KKRDomjnU8WCLhCbOg zlfn``^p5mTwW$5!4dW>O?DrQe*&1N?Dcs=mVu0wJ&OOLE@hoAP8hM!b>-{H3l#Bhx zPL*3s0%7X+vICGzAaWPl+lT*uz)jUpxAJG7tA7QA8X-3R2P?j)kQ08La#P-vz$R*L zdH8(j0@%@4O@#v5Mnr@m!x%8B&pTC?*GRa2iJKjvqFZTekdqb^qF`PCoaU`M&1R@B zk*>Lqj8x*r7uuaDJimhOs~+sK@Q2;&xX1S!Ib&3yJ{hR*PCdbnU%teL%E7JoZe%=~ z|7>>tLYQ>hoV7xR%Ge>f?MsAmf3#=N%tu>_guq&gEZJoW=v>e(dCV(%ks$%mxu?bu zoU<vY49Q`DbV>i)Y17N>IhJ|`1V${iSA*^UPM!Jugg;zRe;6SS5D<X6*{m8=8ODNh zN+zr8|3sS^zDsq~%f;^szab>`QB;qgr#`!iXT4A!WYE5!&ypSHEnlja3l7*THm7}0 zn`@T5kk<_38vAy?r1;1{tWaO%=+(8e$Zzpf_X)c4y7&SrM3v!U=8}kwZ*4c3S0S!A zWUcPAPK^8d_hB8j(&ILYL)`OwJ?s-LgEoD!MaeY?>%iTd9eHKT%)nX;=idSpX$LHd z7A>Z70QqVHN-h!1IhzzeZYKD4SsI&lVxtG<%neh?Ls8QJX+StK1()yS-8V<6BXuNl zV$pRtZhpr5W|4i#zV<1V75+kE()JO36@Z9IZ9&tR)+T}_IVYCJbem1!Yq5nL-{{av zX^YF+LB}(}w!APpjJbP~gS7+UePv4?K{^Kz{eCmkT9DmpnAi}hG(4U&`6neS_nZEb z?^xolNWi8`*6;6pnKw;0b7G+hR}!0@M%E@(<nTggBz-pixv~VCsaaX^s$0<_W1~8Z zM0h|8^lHzhSVI9w5{uamV2g!r-n=4kHMG8NQ4vkbu~ja>4E(7&2_`0{4{UOo^tm0O zeNh(aLiw4W%cqlFgq!R#3b3m7j9R#P%LHAAb*^f+rUMHS7XC}L)`>=Q`-tBccQMr~ z=f*j#Kd2^Ja1xvUXwoBbfBReWPb>M_f1Tc?JRuhWstB0i#JO`hap4Cfn{-*c5kwgR zk+YB@z~>0<K&i?g23z~7*kcZKmhA2>lU~ia!Y<P$UV&2HkPby`@*s+K+a1<M-qOIa zHD;E(qq-B@?5?-O^Iorz@9^VsACNa&Vk-?0s1x>>8Q5dYH43`cmun84t=O2!6e{VQ zXHuD=_a4knS;S}_F1TAo?^|JU*pXBM7bDlgU8+0olzFmWe%ZcD8Fu#0oFRAOINiGL zZef{zEwQHjGq~Oaw|~X&UVbr>6C}-V70iVCa4$xcXU_h_Q{zOoE5Mlv|4+Cw`el0A zstuKp+2X5&(e-1GDBt<lLh;M8Nh;7vz#;nu=m|<07hXvU8Q4*<@XME|u^cb*kuSZh 
zg(u6nhrxz))lJCv6RM>y(JHpQuORrqM#9!o{@v7xt=UO!6}G`~X*bvW`k!@tB4I~b zI|d{_nF0~7DHBy#e0tf&ZtkDI=e1Dvsk{Uue_Lu!d*0!OUG3Y{10p8L&Z63IQaA4Y ziGvCLs;SMK{}>~K6%{oRJw(~=<j0>tmE%7EZRaUCh~vWNGGa>W+36pSgiP=%`jy`U zG=5ZY_l0|p)iPgsZ;kN^u)*AtyWfbeDq^!q_)>)R)wqs~vdHeqKH-C=4(j1uJw222 zhkjTxssnj@fz6`8leJ^oXq78t`Cj!CVg|w8=f_UB9XuN>yIq*Z`26h?@j17#`M%jh zmy<{{RyJzjRR$?3lH&S%;zy2_J`r(Zt<@J$D2Vsy^ktg`#1L7rEtq{B&#zEhZu|+x z<Z6plDn1j3ESNJG(rhDee1U~*W{~K2duzGWNy0nnxCehRtxT}r{Qh1$@SePcWZLX4 z;7>J39FUm!`O`-CrKR1{`(g#nOu0PfjN=}JMWz2te@UD9hR$90@r8Nbap(5j56{JG zjP8fg$=i?K*hd(s&yVYkm*@bPZQQ1ryr^l*v^AErI=7H>tzc9(=H$D#{@q9B_K%zn zwl{>SI70<${wQj%3A-W@T;x5C_>V$cH_@9M!|~(mIK#J>LpHgamU7;Hbja!(TkC5p zd&TqMb&RUjR1{x&;lN{YMrZCu+h=6#)3ldGFv|Kcv55FP`^AQr5_y{qN4%iSD=YqR zqsw3W!>%G@BkhkD&pHQINXf=WYvMh)j*;xJN?`eL$6HI_(fvEtOD4skyYJHH6Xpv^ zZRogOg({*4ui-b}m`%7y$i}_;(C#>=Sg|w~#|!E|+dpOYQ@rJQ5#o$JIWZw$E|aZE zJzimjvjD!rJoWtS&40^xSzszsVQG{mbtlq#cp@N3_&}|=-TAGlx_OfKM@~s6?IYp* zc-HWi(zJs{*OT@dfu80&IEqKNw9g>@;3P;gpIUKWY|(F8ip#W2@2n>|<4}MDBeR5{ z%(cP~JgXK-o2mE9$xu8@vO2x%2{v&6n;AZHf@XS1pPj^-<{`<r7tO>{|22@XxT}b` zx0Pm`^&EO|=Y;4TZHjSMW927>9KkQ$UZ5M!#kU+PQE<Cw^#1j>A=RRw?o9Z@#A@F! 
z0uqWD_-)Z^)S(JeYp>gUDm)wpWx6XD;Ni;4GaFsV-Mn&**81cZSRnY(;LA^N@2F3p z6dUhCC<dJ_cJ*h4<=7z(S6XeR(CNlynpOrV#bv30tN)6<8Y^#>u9x>HBWP(mre6Pu z>Jne!{<~=%aik{YYRQ(r&W0o;Ff2P4Tf}JaAvdJp`RQjgSBY<wfn=ylv*PNgk`E;) zj)oS&aa;6Wy0uzNgr5RNqepz`@Y|(>cfu_%CW=WK+BVy8*3i|m)T@5nbae0Fi<)R{ zRXDA2$!J?;mZV`kU(5cZZIP-(WX8BkUs$f27c&1>n=3&4c3fuMB4We5&YYDo`}Gh+ zG3gGa78w)44LJVUk3fT3W;e&a)NBfRKAbbaUwkR|M5%FG*j6&|Yf`=ZoAn>!>5}}R zsQ_%0r1=u4B5N4%iz?oVs_4r%C?*C<N0u;CHJ)Sj2@U^Wz&1~YHjIzSDnQpPQv|~O zyi*i^R@k@OE*9m-ju;)U(a4@iz5>D1XOTzuIFlKYm%8#|<z@IgiM8C@{~dT`gfnvR zu3(iIXdnmIb|q=>i!e`h`M}9zH)1=8@jMkfi5*%oNt$F%xZ|dLDN6F}N}`{|=3lU3 zKBz6TSY3<*Q$gy>j2A`^Cbgbf%gm15cp0atU8Y5TB_rVOy>qSCfd7e%kH1%x;C`wE z+tZ;vHQ--Llfs9iO_0**VJ{g#Xlx?Cp(@}cx{jlaOuN?t>8K0cU%mfRu1c_<8QIHA zB-W{Op=wU7<<IY3F)WtnJyOs?=1B4ztHQ`+bYLHBrH^?a%EVvy0Ah@gW#p}P^C77O zSk)_t@a7nQx@d1Q)D1g*+a1bt@s}1Ye@_I?)6DEG`7|<PbY$c^3r!s<x#VMg0~)!N zA-V@<&d+=})ucUc8sYomD6Yv*vhM5qzfo57Ks@&sWd$D~9)rlt;$~6g1yuHnGh|DB z6Z7RoH#m<TMxRS2y*v{ue<eL@zW;KP*~s~{Il00U-E|!M{@X;AC?jNmuKs?(s<BH# zL;YcXWuIgG{C@+Y!qca?c_rP&aNOwqqw~8(zac8_UJ}u!M6uV)Mv9g)H8v)B_CGrL z0-prBDo`dPHKI(@{S~bd>6zQcmjSmly!9WDDbW7PAr5bl%rt~DdU9i^pHur1(^Jig zPO+?p1}i6fTxxx<Iu^A&du=^x;qk+W{{)#*74>PdMGP5ATBYdY>e3bC98Blwgj$~R z&ZTy7ERgE5fKp(KIhx{@WKX1|EkC=+O02K2V0N_BfiL`)Hv39#%D%sxEP55VSpZ^u zmaw1L?4<%1VhbqZreTaNj-73*baN>a{RkC^Tx2b5^X{X>iya&Z=~ss*K?_tHJ3k}q zd(uXcc<w`YPWlj<S2y=zvwN26Ipp3`dN9ZkGHSfKmW=s)P25mMjmu$kT0MQNEYq__ zzi({$;XFxGT+;(@R2s3ZLAhuq+!pB+ja?PvguAnLqCnlCpLMjc$-8<xK_iKolTCd& z)$%+~9VD(9N23D9m)2qD4Oyo{r9`-k*HcZboZe2>+c3Wq@dLpVC`}gtck^>I?=x}H z;rW8gug`?W&I6Ffj`LGcuxj<H+AWc}U+;`$I`<#`=D*~4dP0<Ir{xUh<KN9*W;lcY zBI4vfLBuwaXk40z8)75x6nGu9O!YG88P)qEAKW@7-Lp5l+igGP0OMrx+E+yfyV}OL zR`k=~=dxqw7_@CCzt|Re`_6Q3@kUvfe|@8$pG0;Q_4KMb(?n1vVcjL^D{doEFG@dh zF@Fi6RZUQ6y=iW<e|Uq^c|tEgcqZdAHHmw$vP(ePR$r3I70z~3%NXM_Bl*b_@66?} zRWG)_5DDnMi97zd?+L(V(t?ybXDycfMz&TdZ$%eohK9|Ia$l3a1dDPizJKAIH2ovz zWcBZylgfJ`M_84YBHm6X_i3jG{0X3KeZun=l>Ojs1^Iu7i~aYRYHUp@2n00lPJ!2P 
zwY5?h1A6{5m{o}%y<ygI4uj1?FDtH@Op)W3bXwdvPS|h?kB*MYbG;a>PV$u|6B$U{ zD1xk6a*>!z!oQ`Q#5G0s%Qp`M$#=V&P+*?%IKOXQNPIo4h7&sQI47ICIoL{F6}E7x zN#YF5Ozyxw%MYy-rNeRF?Hr9ExnU9s+N<vM!RDIPE_?zgV7c}&4_Yzu&1oHKY^6yR z3S>fQY-hi2nT(ON!)0WC$&JTZESh<8`&P|+TJl}FZ}|S_BiQ3Xg58r_8`1)l>pq#P z3f(xbO~1bK@PHMaKC$eFEJ}nWgqcPl6ZmTr5<%xmu7dqCg_DQBR}0V?X1QmxUN0!} zToEqLtz1SS_!9`d5fclp9Tz#3{xD_bb%!?EM5?`h9>8U&gn{=dCFWZrFu$ChE>~F< zcKT>!g#lMQ_<6;Xt-SA-{YBnws@wt#zV7CS;=oJ^2b{Y{0K3U(j^2l*d2EIltaIG0 zLnco)dh)duW<393&g3+Z_?0eCg#{3e-+81}EZ=aqFHNXKL;26aKDYAho8vbhJCsiJ z#|d<#3Ou#S-2YX%@QLO8rf$`*TVJ@~>A`GnOE($UnR^c%3>i{G&DMv6JyBoW@;_Fo zn&u3!#59qUk=@}LRL{Hr`f#3085M(=uCDGHcP2gffcl%lJ!}_Uzve!a&wQ$TZ&j;q zr2H?2iJL`TQmg%l&eK}uTXQjIv&+3HfrYv7YhCW>RM6ml1|Hi1H%CWDkYbzp8o$ep zU6>83W&B`gt|?yUE9hl?GSKYnKd+q)Qi;Si!9L@btn|KC!##?Uk;@}v!x0{c48>sE z%0c6W#G)ES;2-tZsME!amF<4$=9ql{bG`Wc*~P+tw!EAqW<V%E^o<t%KfCC7vbfsg zRN1N)Lvt%PG}mqUH~pIi*PgdcpbDPNZwCv=Ayz5G=;>Adym>eZf(mmxcellwLr?%? z29x`9(_^O@tx=tw;><(CbOV5o3_MU~@U%_l`=UXHbO%-j&OjjJtqv%x^~*JEmU}vb zaq7JCXUt}<i@=G;0=gxvGb?Fne=VgM@5b0cmgkK&LOtodS7JP>X>e<Te1VZ<x}S_v zhXGwKWML9gxzPG+=i|JiJY%wGC-JmiB3Si3_#W5gTD}Y-k3)Jem}@qVmvg<&Dza9i zZ`G1y?qV0cCSw*SFr#E9H})n)-0XJW=Yp9##BE{`rhgGmK1BQr;iPQ;b*Y9rZ%SaA z1XSFnAB)-`jI{WTc7Xlo!%M)cfc83CoGDf}Zj&cHe#0)t2q|@H&FjrwoLr_|uU_@E z#D8*{2_XSo<Eq?!TwMU9pwZ7R2G4s8xD2i*AlBx=aT#yQ=7;+t?*XF~U<bRi`3>Hb zgX?K%u)lW%iuBjtx0uWB(Q)h@Jj`&Wbawdd_Uj+ogv9tVHWxy`6L#rK$(JYae^>{j zb)i`d^Ua*N<uD7@;~C0P7l`q|3F{wAUIQ=$MEy9lwE=L19Rxo@lLL2&KqR^nsS4#8 z8RNL+InDIxP6XKmQEMzPTRP-!{nXm<4?wW_un+>B?Z2c>gcrv+5tbnk%Wds^fEw}@ zx<6zpV6O%th6oh@qsVj?8WMup=H%n!<3$c*gam+SAW)6T+8~xalr{5Pn^chjNd<B4 zC`$>E$94?-fIji2r2Eu_eER%onOT)ShJH#sG&B@8=N*u;Ev*duRa-o}Y<=#C(0m0i zu8aFxsP0xS&2JDml}OO8?}c(w?e8%5BS?XzOs;QLf;g8^-=g1s^{??sj)QxnmL6~q zFWJ9LM|`LV{b$K+&(*c|mhL9<*bmJ+-`_ER=k(i1v&!gYajWDMJv|q~#Rt-fCpj$x z4KcH0us3Q#d|ecJ>-dMGpRL?Es>U_r=mf9_7K%wvJR6@ub}U4rdq0moPw(@B$L+H4 znhKl2S&sH-zE~qrcyM!JdsrKgAN{+dH~=8R#upCk*^{c%3`5{k$Jwseiqh}rm%sAp 
zt<e3VcrecL<)QiMv7nBjs`v;7Gce~BOj>e*olghcl8^6r_Z5iv&Avz7Tg>sCID$VS zEr|wdVj&lEuGdC)-8IGaE}X3w537)?1b8<EIxY~11NQ@d+cx;-c6w#*T7fCk?A_qf zMeq%l3@QUa3HVkU;LX42jkw+IN>JBJVRX62(;auG|6h<Hb9q;mUF*S`?|v6pN;_bV zdk<6-5kDX*rI)9K)@a*(H0lK42w~$f#AAI<=P&q9T$yHy>nqpp%fVD405u>j*fBwH zCo$uMk~zp2u<itw#<v;0U~X*S)I*xT``uobw0=$J_X{+;w>b>v;783~?ztkef-X~& z%`M=_Q<ZB%et+CiEI9kugzgo^+gSY*DTHv!a*q)5OZ)wWM#G2sf&@}3;9}*cKx^K* zKOx^6>t^~2@no+Wn)0R#O_d3ajKW+te>?T9=etI9KHp~N8Fd&*=bJhoE4Z}Uy8Nba z5O`XS(tfM2GuJf`cc}ntoeRJ}I!bmi-}8eOE)HHyJkCOSw53{J+z=r>%>97-v&fce zn`T9$v*N(;U*v-n7|T0$^}{{b8X)~KOx~GOv3M$*dLA0_C{J#z(KWPy+Y(zel@6B! zAT6p^_g$si>q!uKK@f^0a=_OPuOvLyD<G%y1zu`t5AV(IAHbz+3%*ELhqdRhRQb>s z{%X?pVXj}aG@Pjh|K0pI_cT}Ks#TFj3oW4yh^v&&Dn{7fTWA0NCdF!=MNo=#VJv2& zvWMdRb}RA+MuwZ&2cvm@148!j6`n6?Az1^o)K}O~ml9rBV%Gz9m!Bf10zXSimLG<E z6G~0(=V?FKN`Ajg$K$%fG^qv!o+iT|PQ(Cj%E=Oe6~Uk#8+FsMc4Yx-5LDmqNnXT% z0C*t~b8!Flfy}<U@BuKvcHw~@zuhTLH;D@M{wddp$Ea|E;|_tGGhPFE*zS0ehXv-i z|21&-n-bu<!tkIF=+S?HLZDA92LR9`d}W2Byuyn1k#^{ZTxxk@MQC#rZg!ahgRxn! z$^jYG8MySc7IS5~(irxyYXS+QA!oUlw#6eo;GX>9Bw5iBrHn=v_&XcB<;YzD-P!=` zs2{ld5Kc`W(Y*I@{KX8xLq(R5bHMMP??$XPQoXQ~dpVS%f)mx{vXiK|x!hr-|Ler* zNupPF$3+mD2|!Jtyw=>og9lgeFejZ3vL|@@0qiM@R%0kbxPNsuaZZ#&VCPK<11X>i z@Ly2R_9`z&>gc;G-pOQcc1thTxpU4O7ZFr;%P|$cm{c;vd)Lq@YnG(Z2I*b!U7w)W zvht?EBO@9^xz}p|848&KXAZib*Xd%Tkoo~r7J2jZ+ZAO(KD2KB%I`rKS8eqvGkTmb zvS}}kje3R5cL>Ad-IGcOmIv<qRlnq744dIy;o}UKk-iRt;X!kSqWHfNL<kIj%Sp6T zpf?V}fJ=DM?z;36^*h&3G`qtsq0jLEAKq6N&xg#8#E}A4jqFlJsA6=Q=lVoWnMv5j zi8yPW#gL-}h7^0c6kICuMZkJ`8WFPSk}x^i{h(^)wNWhd{p*bN$XpEAxYHG{y$4B! 
z&MxF#>5BXLVo7Z1&oAKD|I<DK>gL_$1{xrFZUaDTk8~LwT~)Z&v;ga(B7+hwZ7`Q$ z5S-S0{e)%zyKdX5Oyu&_;@aW_oZ*p&Z}rGLJ;mZpIr4kygHU`BTv^2g8{_hSNcCvW z6zB%qZ1CB+%WgosgYhF8;d&&vcu-&6I$>Nlh8)S$s(<ip{veAj(LW;Uf*h;E;OZ_I zueb|V>4DzCCR}`XCmJ}F0QD#yLqWtqk)w5Yk@|~%3Jcz^;SMm4(;I!Q?t(I7f@x7p zFP7#@x0{Ey2LVJfwx-VDUFq<4KDxl~wb3u8XYaWS&$b-nYXSDfB(N_qCQaStJGHwV zCUXD_7%X*u9ZQ}a^LMERL5P9#?!%Va?C}U6r*)th^z}v`45V4G6OL@||F#}Zfcwfh zXVY?)M({|tHsd5uQ|zE$4}s0P+T3lI1?pwgL8qMlnK<DqjwiFq#hN*sZEKyNm@Tf< zxv-38pI_PqA5^i0Q9*AI9Syz~g|iw`2S%WM!tNF`#aE|XjMY(h83J?<{h+({BQS#j z0Dw2&NqbukpY!s`*0o%?S&(UI{q~y(Nm->X>RbOn|BPV(Byzjqok8;^mb(RCkup9| zCg|KTcaw<l)Go4~Lh2T)Xg-J7Rb+<V`FPtJTXmY3daIidfG5%J?E8R$gDhz0Ym06) zS@H8d3m;j(V>Qpq(Qc<551w^EhmAaX@Zb&N+UG*L_hAww1B#dOk{1}TguyPxj29kh z&CDY?Gj_Yal|8-%wxaDV4p+Y(9v%j~@D9i6TUlczt_|A{ZfS+BciPMa1GWWP>d}Sf z18VN(@h&k#ZqCp7Lpoqh=`{b<{N6cAO#nW91cET&K~Xm3fcZE*%aUQOLp7`7J*obb zS#rKlzh`c>+!kX*v8}b;wRh%3VphO-`ow>H7!X3uwt`kv)|zw<MAdiqiST2Kl`}e_ z5>U2k0UN91qbOR~G=?<@YzP-=q$nx!NTzl8H%#-}D Dia)7I&|G83iikgf9pxFG z=q7V)`0aPozNi9jdmGX6d^Q>PM;BIesUy+Q<q*)@q%zPj)`jjluaURR!Pdgps3Ded z7vG%wCv4)c_bj<40hT5mO9L2(7D2_EDe`=i>U2|4bZ?s$=_pvw*V(fR5nez?m)ENg z#HQz9cBPESc^SkaFV6svp_KDemsM6~@)svUbse0=(}2z2%M8Kz{5@tO{5@;4`?0cZ z37~0qi#W_}6EQ#wnZs<mPxk%`3`2bKUtt)-X@zE^IjsgtPf7_*Iw7@?{;_xt#gDGn zhh!rz5O%T3w7R*&dD6k?>ZI@NjIsPib9Rq^0go6o(wH{5Uz&<wm>a19D8|%)F*oSJ zXO{fne%ZqPf_$byK`)9Qisv2sMdYW>%H?6Qco2{mUx7voF(ocT3&J;}<Z<)wY6k!q zlMVNaSKvq;&YKc$Y}i>W@-tY%cxkdaD1F-)2NoQM_f)y7ze~(;KiT}DS^KAb$JT8j zK&|s%_w2165V+W*-+HDBD0;KuZBn{h&+PJTn&38GV6MPIEKs8!_x05G9@CMm8uu1e zw1@n*kKX6<L#!U;9-fb3rUzzNr@4F|;2)vP4XQ~lQ@>1o@rJF<hjO^ZXASF#>uul7 zaq(o^2s=VT*}FGE@cw$iAR*L{pHj7n&2nCPsrrdO8DO}SjItttni&tw6llt0%LId= zS|hZ=D-mW6kIuK%dqGngz(h!=43VH0HMs-tIv%2IvGe@lD1TA--55&JTa}3vXbY;~ zz{v6poxTUR3NWR-)At~_jc`(b7Zk(Qr(w2k04qmeVl^zvn`w5G6Ac9&4a-Wvszq5R zvYNf~aKiA&ro?>o$IQH!K2Za-kGIG?2!uwyCM+N<+^v4V*g#%(w@_q4I^TCbBmU;- zn4C~|N`(8!h%eI%+n*Vj+z{vx7tfD9EoeEjQPzlXXL4a$GdjHj2~^_2p$8WI$B{I0 
zOl0KHe7@ZevgufFdAh!7Bd8HGq)z*w{&vzs#;VAvuU;e*D&!#N;<3G{^~)C3_pRQd zKP;>IV%kUZA-;JoCZPSPL--y=u+P|!%A~$AGePo-<Fgv*2aZW5eZa*n9uO?_2^Fos zpS{d=TI8T6;51u}57=9`5Wy?TX%;F;sLKO{+1Ic6FLWLs<eMIoGl5FsPR!~&`CwhO zkuTxCk)te2ud>GWX6|welspnl3>s#O1_+S8aA?P!v_N!!j&?nXR&Zl4U-?J<6ujx+ zj$SJ=ivmU)L}vingnY%*D=45sA!}C6Cysd*<?Nr$V8sbX_QIj;029ecHnEe}DOA;t zPtDE-B+JqFgdMsUVf%%>fl>ImALj#g;-iM2^=~A=uNznzcl`wZYR&%R(O1DB4PuDe z;<+0L3T_Y274kX=j0S*<*z*D3;B;FA$lieUCK@@7LYUkDViD3&Gkm1fqzxwBA>?>4 zTP`v?a<ZN8;k&H0>a5RtN9n60YsrltF_##q{mcZQdV%qEh^~SZw%RILKU>T5Gan(> zy?H{=K~uzT{5Ij$=5^Di$t5>8w*y7!!rw*l?(-#pG6hK#fr+Yj51q*k8+y#$k3FU3 zCkDc`$&MVUJFao`P+n%TGOIwT-2u;H<th!}^O6-K+L~KQeUYsUmU=Yp;MQH=|45tg z9Ah($`OSDEA}D%DaPG3?6h^Z`1mhbnEqS@q>;_ssP5$^hDr_+@Nif#jF&4&gsrc&W z=&&neH$ev1ubJdfMMkgk^;x-3T#U8RnK*lNU4WPlb05*DgAzEF3!(;Te+Y0jkB^m= z<lJU<2-zT#xrWF<DLnv+FHc4ieU9qLa(G&Te)V6#h1lzutDPyrT@5$JLf*_sUy@g> zhh7W0&xi)_G=|oL2*Wet)vH$m3$qm&;ikZc0t=dZzoqD*i?xA0+U*w+T^#Thh+Fv5 zX6zVq@2ywO?4$YjoQ30oP_^X`&O!v)!L(2TSs`#mpS7X4fYT%prv^nw^#$O*-*sxf zhRlAtCu7|Ye7`-x3=D{eIxnw+A=BiC1rGaYnNcA(>}pAO^Ro6O>dx{GphF1WgR)TI z+t!2q;p+`c{n?u^T>n_?F{3&J)Lcm3mjB8(P={J(-l)J3qZ)dGb%P|>rb1*FX*Xxf zWV1Iv5yg)~Uf5{^voQNgp1Zkt?uz^c$e?(NkxEW|Xtm6WJ^88r^j14Xl@Rg8o=}TN zl$Pb6j7u~5$5`~av~0JI@xv}H6wa9J0S#WtGiXro+h({!EU<Vjk}<HP4v9ae8o0!4 zXD7jZefx(?f`R(OoMEPB$<?MVN#>Zul{4pckYD%>#Crg<Qq6Q4cng%JzKl+zB3F>k z@m`b6P*JiXfj+c-YRLU-nOo>llEd2NpDrcCfb`h?<&|$CiqTuWR^AczOSwc}y2}gK z9zf-tRQGTLzO!eR_fsAUPL0RaUfy1I?at5gykP_i59}*uJBpgy{#%}(W^7DJ`Fp9C z&5FaS8WI>FHcgx0pvM+>fTfrLW-<U~?iKw){KbDdsbP1^2|#(SmR%ZnyFR22kQjLn z({MOx+)Ts5+gS!4j}UXDEd_1oeB0B7zSbbHf}*8#a7(RD?N_wyw2ANL|LCjh)yU?? 
z9&EBp@nq#Z*VIN*y2UiAGPyuOEDe##ei5%z(fTN)GZPd>gy(iu_F4~K$*_|rU*(T4 z(|Re8QS2up*%K8e-sO-{?wl1p;&Yg?eiU>6mNea>9CfOv3@i4RCauKIvgkIe#A*Vx zzfE%evdLz|^8|sY?!)Csuw8wJyfLgNg@RJcMfR#<*6r=)0X`QZ%W}&<s^;x+3Gi+| z-*Zxhu1!LQZ9)P;_PRq(p`TSW+WciC(=uY-a7oaYYOKC(pK*BFbDW8D-V(9lijbU) zrqB133H03N4n3}e+rB5s{D>Qnm`K?I@LOd<ei-q}Bc?;x^ukMhVI_P4Wi0vFZ&*TR zJQD1_dZ*g&Bz)#SYWG#J@tElfOHaAA2aNq?p7ms1x<a|PR>^TO^%f%q`=ngj>aW%G z;Wh6w;%8r9y2EmXL0fQvqG+x9fy@JM7h>C+@vE}ITj`krjyhxbu7BWF@AvtxBU4bF zlz_lkz@eYXs^~@^x9P8%*Su(&h@hPPBT@SqCs@LsZ&DEb%*>v*VI1k?&vm_->sHIx zrCIZcyWE{?KYuUr!4*XNqMfxYeO~ExbIctEZ7Rd1Z#<IW20%G!SrdQRl(kvn<GJrj znzM{%z6LvE4B5x4)A(QOr;(tL?(+Z>RRE=M6?8LHAX5={S&1J_p0o0kbtoj@p+H{n zprK9rg!xHdDP}}qaSIh`D!gN8CD}72!VLI;)BRgY$xz!%VP^afYCyre4*~M?Cz&Fz z>Y3wbp}p#(zAzt6=K!bZuNkQD(Iz@xv&bHDivLhAK?u<&iUvK1+y8)>nHd=Ia0m>Z zDgAsArUEUo8|UMTN{l<%Dgf6G>fJa|3s+acp^Ni1k;N9}%(rc=xXsi!--Nbh{}b;~ zdW`xE-m}{Vp51pI8=gp=M*28D5s_zpvhv6gZAs+qcUQ9K$>qkP5@O}a49gC^FV8O6 z<f6_4gn%?Iq(kVd(u7>!htzqfMrV2uWh1+IyoVNcgkdN6FDOzVf?mWo?Nf}f;ACy} zf$f-q{2u`&{muo5qW+&Ms@>Le9GGx?nMO;G9CpI<POn-woT-U{x{<?KNVkJYs_YEl zXl7;jQf47VbCpdaAn56#M{kL=;*RUsa4z4D(rNNNCJ(J3`$1H0F!ZzvOK)JIzy!Zn z`e$#31(|*^NHLIxj_BJl9NKoch#bhEUGSf@c<J(Oq`JDg4mtc>9?c)+=MZN=J5Qc& zL-}gR$t_OtaghTZOR<od(fzhKcMMmU6BZGW)1eQ&Gl6vtUf(^`b?^i6d+LwR1ZLvT zwNDp5)(&i6Tzt18x%My;WpjvMotQzOtbqxY6plzFrmn}?J%(N|r1PcNVs58-Nj~9` z1G+%rg$SmZx*)xXyXG@WzB;yEe`caKk7eywY+DODS?t=6cxD(Zr%^ZKTPByX5AoX_ z54dAA+ze%?Mg)8_N?$+aI5wwVB`b!Nt<>^rtDiT*4*Y}pUtJ#Yp}dJ%!wC_+W-W{V zdBn8e;DpTuWiK-CXE0itUXx)p;^B3H$zdkDd=Oo1fxYfkr++=}GcUa=kAkw7FZ4at z4VXov5{%CuGI)CBPV1O(D3_cP=wi-dc+?c8y(O9jiccbre|gf6of>Z^>$Drk%hcCM za#ID$$_j)91fT@VTU|NR_(Nu+5?O)eO2WDYw$%s<q}5<-jb&1;#pO~JmWOXep7d5B z@#kJI`$r=*TXtF8>1HqqQ8kg5wKHNd^)LS~to=I*K7;LVx~be&?=Zf$rB3Zuy7c)= z4K_1|t5=I^YV4EZUD^OFGJ^b=T;Ii_k3xiZ4K1IUq*ZjejQ)CB2MgSge53S9_3fbx zhi$yG2Djg^F}$@^QTW7~tenXw5+~_&{Ov1awHWA($zgHz8AbVsSRn+_V-g-l^Y`C1 z9bGJ$Ape@%T*qPBsDB;WiwcX0x^H;Kp(E-PWhpBulr~4D=4^Aq4juHnWFP!d62h_A 
z5H`fAzrj~(dU?z+dV;^VK8gM(0i9d9y}uK)S>>sGs_6MKNr#l^CaH^X@<D8Iy9?Ug zQ)5f(P*n>zWOh&bB?xm=SL(c*3tDUP*&O;5u0)KRbxaq!LYWXS+()uqN><5M<;3|a z*L^;5$Im*oIlKmti?@O$^X|t&p7-M$C<X#0CKf|54IG8nMT$Rr<s&sD`lKjuwTzFr zs`ejocUUg{U^QKl`~61Mt4fy!-<4c=3q%At-=VJA1(&4xO=np7?!{}W*G<V-sYxJi zA)8Yz&YmA<D2Nmnju{r{MK3#&d~3;?8w)ZhPNYYl{+x4qCZ1MfU2O<Ti3?hfcyBok zRB-OB*~m7Sj6Q-f73$yS`xEJi(jvC{Jxdo4ke+bG%EWm{gk5rRp1WUTYo!@-bqIw0 zAYCXigb`&P8`a}(4FD|ZMQM&$uKUEh&UFobjqm8aYrgYE?FBb8A$N8!i+AaYj2b&) zJl52k1AI)cTKk_asJql#^U?d;DIB7LXR->Kp>X(m40F8e^&z?a^U`Bz(B=NLRP5$Q z)r#aHzS50p;a%mEcj8Wz2Q`<W^sdvM4r$Kln|sOO<CqvqRl818_}NNB1S3uc1_`(T z7&`Nv(9^{iZkg^VF?@oZW+poluf1R>%J8>h8v@Kez*Y$qdqBbHUf^bfwb)uhvL_vO z4aJF(deh#h8beV@=GXa$*UchF4ytz~sa~${D*-<=&TSRPs5O?{#h0-#`}%;LGE3U@ zElHk|ITv#f_JuOgvwFQ@AM@!9)1~}z8;tQwL=A%?{N~j&wC|jA-qk4pY^p+@`1Tm} zF<e)W;!_*3q!O!XcDkx>*b*=Zwff{>ABRys^(Ke<!ht)&PKc?*CZbPHULLXXGB|`Z zjBh<(E-f$)_~ruzE2L8xGcOhNvZc{x2aOJNtsiQ$2M<4Q$2aS4{#W}jdm#!^Hay6X zk#R3I2NWv|RIkQ-MNHnXVGakxE(?A=AC86;>=1uO83$`wGmWv37?Y!8d2KeTWSKrD zLh2KiDQB&ZOnc|=_!YZfqKvb^%Dz6q1r~WwZEbBno0OhG-b0`aA&<X<)wzc8>#WO( zhi#^5KO{-E)k|GynBp}<-ayF%YjU=Rw<;_mekgu9`DcsJ91RP+C4%93KDTp36jA4j zN9fuQUrY-P)ggQcv{=~M(a07*a+IE*&tGy1`Id9dBc+&3=?N4FC3?;69UUWGWICZD zw=7S>Ccp+nMQErHVUGPWNkG^|QK*Njyw}QsTt*`^k~jzDr=Xf0wTL-VN4@wBc54Vr z0YyI4?b`s1E(@kGLvwRk;%i7<1qMHdi)ejB{&MQnDZPGHf*aB=<xYqwte-se->;a5 zN~o`|Z$l16WH0kaZn7;{A;4t=OCqHVFrg6yorFt!%}`tWUp0$KJb!3L;5iZNG!ext z9~&gx665gYtZp24-)VwdF&3poYjbD{{;D0QszAnlP}}fd`|cs!C~Ts`0-nF>=Gu@K z__m7>OC+0m{@8Td?$+Ih)J_`vd?5voKX8FatqkP&M_1;*`I&jnG}N^ZnaM60P;z(Q zoXEB01Z)4=vXu^CN4uPiV(=h^y*&*dm0kzQZMz$*cA{ibFKBk>4AwuEd8{)Ou=`gH z?(R@Dw^TV!L<oGDzlP8O=2ECc{0f@YAC%JPt@g=O-SeoNjB^3$u`CzUW%un_T~daf zncSVPDA5sldzEA*MK)Sef=IH7#;A|NjL4ZACS0<_JK>SP5S$dpmofO~S6@CHMSP}$ zPCqfTIibS4+k?B?U??{z79#L07Jy2>OG|jm`T^~Uln9*-#N@or8F8X7fa8kTB$Vm$ zL+!F5)nAa~<d?^4d$0aS8G!$1+LHhGto~=&#Q*ns{O|LC!E0~iqU3-W-wtYAp8Q!< zY+;rQ`r_e1eyADAv5?;l6U4dVy+k*B;J@tL{_7KaIPSfX(o;PUDF=ld|2t^@?>+ec 
zp&hL4mN-<^*r=>-Pr6(c7H~@RmS&&Ty=TgL_ZTJh(>3W2n3<Ogl>@p>TW`K^OqaY< z9Tx5+VS902i#NbfVJ;AL(Na~m;kdFm_qqL=58hnOWH0{yG}^toD+ObB{4m+k!~@@Y z1*P(f)Rp`X$0Uawdbm=P_mU!Qw=pU%jt?iapr+W_RVgH7)cot0tjB~)Gd^~xzWygf z<bTqud{zq{0s@75ILlSQuZ&DWZO07<k5z}c;<PP1a%OvVunMfK!rRk#v6eDKS{06M z)~7KIlsK+qZQ7c}=6SdvlBw0rv+ipNqr6qy*s3Nc_e-E>=fxb^TGriN-dw2K9e#G! zvafL3m(Fr+j9J*E{Ek7b(?GxO)@R+VropYiQQjDCi|Ck`0$eP~kVRYnbVu5Fi^qC| zhx*zMomb79Mf5%b!x|{LQ4@ne4?Jma_{%W-8O4VG=JXH{{KKof#|H?`r46nH4ni1A z@u>xPB3kQ7R21qK<OFXX>|$+&59<h=-(AO1)v|FP<SN(_&fQwtU2PeeG_%^m?b=Lw z6l2eHe$n05-CZRWUK1V@hRfJ#abGDof%W1yDZzRyt9fv>l_T&P>Y?YXIcx2^i}B`w zz*0zFY>dxjz_IV9801|`k9tPFmAm_W>w5C0)y}BZc8y0|3G&UKZ5PrRa9dkjb0ve8 zc|z%-!t0^JQNOgWBIVVP<K3U2ly@%>-KuQ8^EZdnMm2Dkf=u6(i;Fk}de49GjuqJU zU3<9MY+<kYbfe61p1#()*u1LQUQ|JJc6RpVC>tBweal1XaGDQH$%S3@essH#pA~vo z&d~MFaaJgxWZ?V<>LYg?qBJQs6U)tc4sf9?N4L}*=A3>Byy~?iHl3;wcqQmKdGu%( zA&~}%)3uPJ5J$;N{wFDi9zlFWhA8I>J}f_c@A}$e#T=8$VVr?wO-xexRff%#!E&qJ z%3{qY&d$p?V0REM7ruLU_2dz_TUBBge?wFr(Ej;-|5vY+^WicA<uobwE~@ogVNssR ztn;@L{9S6ye&*JuT}=1%6u<lTJhPnl!<pqP-;m1+Zz{Lyt=JqXbAXGsD|r8Y!1%_g zK;JpvYU@8|vDutL96Slb&k2WC&p5Z*pMN{r$$FRfWx-HWucahctM<f+3cfb&Tf-}J zRSuhiccLTxp2hP`@ivFg^}xNGF?{%NxHCf;u3VQ9c>vPH3VV<3)M1=YeeZkQm^r35 zaek9m`AA7gi(@=>^+5fMKmPe{%v7&#t7d+7zJdR@H;F*}IDX<z+suq9<$+?ly~W(s z@Tkn>CAX@F;8)Dxd68H*rrL+29f(-i>}@*Zyf7g|c_(;b^M|`JlJjC6IF^^UzLO^U zy4fjCEzY}ntCeLYy}GkbNpAV`u{*(Y_Q;Usnejb9b$K82BTjaf+j)jplFdr6UNsM{ zp7eegS$ew!dz|i<v(WNt&g}C+o*20~rY1w>0qM=q(xf%HF~}|)`S=cJX~d)1?^1#d zT#)|GusQKmazn6X7PKJp62-TPpPnOrDxY><$8Kbq_D*nrzRJYpWJGxQk+W;4tnBPw z%YDz8b-2*-%*0PEOIUThMm~=?omsehvTG!J^#-?vkn1n+IbRjgM;+Z?QnP0t4%&;p zeKLQ&<jl$)2Yka_%d5>P+wjeu?|W3vke;B)z3|>(X+B_bAP+~W5?LqOzK`UdxZzX( z=&~UBV|fuP4VEPWH#jtPbh<6uXP2*Aj$<rydt?OPyHv)_Vsd6r6_>#n=`6JV+>|@U ztEO6?;(A;t&ykM`hQUwQ(i+bRwkTUEap%kyV=a2@WE@8n%GJ8&?7N!Ui?8uL(&HB^ zO11qz?7d}JRZ-VAiXDW~DIhIKOP6$)ly15kq#G0@Hr?IbNXI5rx@!ZnDd`3QDXB9z zKJWXU?|j$woqy-o*?&M>u;!X;&N1#W?t6@R-fK}mv&2ES@Y9!kq}2}j;y$sBKaC!1 
zC@;}&H+X8|rNE0R)$4;Hzd_PH{qtv}P@ujUL1klk$=uZ0hfqBH`~xrDvJ^1HA+zeB zm+%xlszH<S-?0f&e|$<N=!aS6lc+QNI}+R~recU))A1EAKZ~L6&HD14^0)wtP^`cD zO{d!qkH4w^@vZfKyI41D+hTjeRX?m!0A2EOTd)pawwm@%+YQLXsBxqi1m+2c=-xzW zpkq#SyHH@hwq1)(MV1_C0-rx7*N!uc=Cr&^VC`HPHkY1}kznj-X=zDwKuxmDOdYV7 zJ8Cwvizlv3rNcdF!oKUD-<qvirv6e<z+(S-<2SmE@~D6NXG~76=Ok^WZhzJUhFe&` ze6wg>mx7|VWa83biJ^=Got4SerlfZCRToGwG)Ce1Iz%-#lrZvSl|&k1BK#cZry-hq ztt<_E`ndSe{HgCJf5F>LvP_;!{C#Vyn0?+=V%Jxw``tZ*uVvDNY5NxzmFbH=-^UST z^KF$W1?A;Y&&VfgTg=8^iKr+(2z`FrrYh7gawxV?hS;HtmGR`MuO9r>y7+UJ8f5p7 ziH}xRB$Q_o(G@Y+tY-_^vv$Otx5Cm-vyV4JaiT|_K>TA1huEnoKV|7h*3H;qk+l<2 zEANcmF;M<>*X6OOg^kTS>0+Ch98SF3k$#S}38D*rDhFOUa4Qhjv~z3m&%-0o;g!@= zkv%rv&xMbBGQ0LVrjb`8Z&DQ$M*%LsUzRxZF(~!*?<5n#v`B5ml?!hmtE+{e`|n@m zU|fY{OYO+)iS0PUF~39F;WS}247+PQCoIa@F!)+6gfT`ifeTI=(!|xS-AfEQiSW_~ zQ-Dq)J0~3(E=Lck1S!rxLdxP7z3;Q`QnI`Zj7Wdqe4Kv#)%HJ*!!d!Usejh}C;pr< za&s+rlU{6<3v@>am>~c8XrKS^SDGYKx<@x~L@3H=xjJ{k-1CxL2?fWD;bPE7g#ul( z_;5p=lx}98Zbf36b(tA-I%aFesk^7MHuV$I6~IKm=1AQc8jZwGOdy2XFULr|al}hX zN|yGL2lLHRT+I$|^a+yT2fhAgu(|TbtW%Sd9kz%5xm$g8zNMmXJ~)V4&029zjJ6(r zVE0tQ&US)1G=5@0+8pqwAuhe$xC>cY1FFwq5KYbgJ?CQ%iiiQwUPC2?>LgQeA1Fm; zc1@ajvB;uMKKhaaqj|3LgNB)!@qR>h&r;>*L(Pc4+xkA=cbQ!s>So^Ep|Iuslez1x zdLQg7u%L$;$8CX|7;c;bkIzMs4qJX^U7sT{RvwWW5r}diEBng3)}U(m45RZnVbR#O zOS?x_Bt)ZhYt^C|?7%lEgR85??Q3~Xt&6p85JQqxil*P+OyyL^s8h5=Xw7QPMaOL# zpNoZ8H9TqIHckS#OP}Dxy~P<eXH);k?=vs?)KT-!M+jUKI@P7eVx~K}eQd-GP3>&f zCoE8A3=6ibg;1;$p*;kfd1@OfnJZ{4N(Nu)v3b_6WSx~fZ!t%WI*6sX+PYtGEdBgG zp=7>iSmAIhH;P=oJ@znc!{Xrm%2D}V7B~Vd<L{0njt@7Fy4f8Xqwi^&JYP>8n4Y9X z#ohDJz}TT4c_Zc$Cgzy1tDYI~A5*jM@#zaj=fz>`Ol$Lg9pg*|fHQy1>SsjPqwPrw zrnZx;Od5tmEhl!inb!~V%3sqFYT>m>YHE~P$o!%UQ)MpnBWE#Bch+Wh&eKlhONvm{ z+Un~OaZacUj2a7gMV>3mviW5QI+2f_W~&oB%8@dd1x?|6KTO5cD=G2vKpogp6A8lQ zN7XRMRoEhf<lfhr-7dvP_9-<f1)yO_yxrITY8aC88KGUiJv2Preeu09RmvHr?KU*V z+0V^pEYuw#T~@(H8nny}UC1_)teK9G%JoKM*h@}OzrvTaCgJ^h`kXbG6#x%SWkSb< z#s{|H0I69mfJ3dXJ?fE$4(t)6q0_x?pRuaXwyBYwV&&?ef3G!lUzogH4ZFHop&-UC 
z=@ljIZt#TD9NUWH0Vbp>s(hP?nYqM=2*F=_S3z+8%sVw!grs&}3FaBDTOLneK$$x( z|1t}^W|gw48pY1F^I3PpW+`(~oqCFaq7uV3TbEE@f(k?<qN%3~m`2wnqX|2HU0GCg zuZ3YI=Lv72tKR-GEM8#`&<nrcY3|Iva;X$QKp!?2&j;s}EfJ1+^cG@ndSWT5kdjz= z+bZM%1=t@04~0jzk2^AqR86lt6PCFlElv8Lka^|I;I54JInRv4zx~p~hhyc;aGY1( zN%5{~9ILXp*7lyeqn%{7U36S!^DL|mO6DyuB`Ih)U)u$)ez+GIxn~H>55E$PfS7{p zEL#=I5jAD%7>Op0d|YM7c>)&Au#^FR?uj$RjLtjb>aPZ6){ZUDQ(729j%_WM-&y?~ z1lL2q@8ov2G@xNbTfRWqRt<Eznzh)2C231Ru9f`eRd!QNccO4e03ki{&;UJ8NZ1dl z^GxSFAxBbLQwT1<FP&SjUC9~q^3hI7LKqFc3qoycB%98yO|(Agu1He^l<YKy!QD}N ziUl=t`6(?Nive!Mw>+C0FOwwaouvNFxa$3vvzL6FL_~{<;Tz>U?RvRy=yO=cFZE5R z;>;CGc$IiP))5Y@e>xMD6S%N;7rWbBmCg4(ONnmjlk#b+<Whp9#8PS<2Xyqf{14x; zZJjUOS*}w$-l6UOW$>ORXrr4f|FIOt+RGrzCJ_u~Y^i;ThL?Mq3N@0MKW6F(UNd8T z@Q#&#w4;dm(~g!Xj_7J(AQ9Aal#|VzHmay4bX9+3)3HjkaNdHKcC}{uK2`>^lWYv# z$J~P<akb+9cFlR)C24XGC>GN7B&J4xkx`H#Oj|tmxP;hm;Hra#!IPF4<+;(Kx>3A~ zwBYztPQd;+FETQX(y|I^!{&4}HDgZWTxWyu)7=QQtEW?k4iZLRw(dVeFE04tW+~!~ zkird#No*sJ;=kcckq?>hj6V4$Ff6Mj<-tN=#zk@*W@K(IEs&l=m#k0(shCn3zPCj6 zwWfxOw80hUpu)>S9rVBjD-ErKyE#-ibE3AgGD~NPkZE?xrk<Vj`tY!7QUkZ?9c8X; zrgjEj3^>#ijl!Fm;&CE`;0;F9VDG9)6&I6QbXmaZD*t$|oP)V}5gH>-wRiC=wdeN@ zro%cWAk;C))*?|AY2Rt-)5pypm7R4-q@iVFRjB-|B|XuN_?zLd{{GiQ$6^jGNp}_u zZB&~QbFTAl1>G!g0;x0H``kmVSEmRY)UCDi{R~Hv^#JQBYb3_1`{t`tL96i^H$i(# z7rnad31Pu~Q)_GamLH6{$&#;K7^VVuonc$rV#=y@$uDu+wPWP1m30_Dj=n=Vkg?=U zY4n3`8tJ$0*FzY2%n^Dli5ou}u+^fcDaK>ht+FQxWQCu*rcar@3Eq-`sWkUe8%$IK z(2m}&6xufV^j=aVhUg3LvyQ$|><US*8T6unxExyb@bs^9!F`!9WZcw*ifs>K!JKaI zkIu{JGw8zT0zQ{3N8O4Dn+H|^$QsZTg7Gs8crHW?_rEb-?!Z7;hsdz3(EIv*f1A%_ z@PlIbNanU#i7ZF$f0~svkFn95MWh_Qm$HlPcP*dVj>{UdR9z(;$jDEnjdF&iabEjm zhm_03a5wfJ!MHm^T*R#PZDWd_e|k`yDpC0hOzv&yt1klF$y}?$%;3=sGH9(a=HqZY zBj89c8TqKd0C!bj_(4Z3()4<~ce6%SHeRL|k{i|MutYe$@>F*s$1~M+$o(LhSEe%c za~eNJfBYoWoCN!*i2QMt>{g!&ZsIdz$DYNJ&n*QyZoZh#0SYaL9Yq@H<kGBSAW;6j zCUzTOn1P^M$o<^BtH}V1)O+n$`!ESrfgg5BI%@mZ;99nTuIBrb#p`0u$^mZk<Il&? 
zS<_M#G-cj(SQu1Jkw6<Je?1X@Y2U<T7aLJs@^00@dHaj}&omoyWeL@U1`|7*yyQP& zw@u<v#&a=@v461SYLSQSUu^xFJg5!<j*i&oDNh<_rjxZbV=#ps)v6SAO-q=r2erce zEnE1L<D(a#VF^)J(1D3ukiK!o{)M^)%Am^Bt7qJ}%4v8EkAyQ%qT?mk8$#?L@5yUz zyOTI>ce$LHa_(S(dh{2Y)X^N8`azeuvb>}?sSOelUeTDL%?{tbts!GV?eK)uIxf@Z zuM(+`$9&snD<o=LRq>vgXliSxDXA4t_HMbS5l&7{=BwXJ%z+sV@jU@pqV}bravDMw z!Os2Xb78^q;XrlfiM86$G19mc%W5T-w7pU+d;5866N>M}Qx2PGqR`K6yk?}xW#`yT zRp2XiNEm$rvB*?o4>NLdQtN+;N@!CgP?O~+C;pnPaWov!sJFq+pmU@`d|ol9Trl42 zEz(KonO&sbr9z|L9Dk!KdoyOD_eMFovK#@Ue1c3ne3A}Hh>l&CF-m3Ni1}m-=PB}B zn8IrEJXTbHJ{G2z%>yi0xJF)>1A+O3T>AQI&t^wXe^9plltq~89<4zX)LS_-O}&FX zWzukoLumB}QyGkZSXNdxhYl+)F=psFdw5}}sSE?rpy2CO7n;@cWT`ach@xBaTMiLR z%SNQO6bHE!VXh=sfSG~X^#NJ`Y&!ZIu_MCAn-c7!sR|*H-Obli#Yw3H?`aj}Q1I^y z^QTYga2`zQ;E0DwP+Hd?uAe<esF&o`vIJAGP8oV$7<7uZlRp-VD6&5B(haxDIK0by z5V)}ESVI~!$B~9M`VsUm`8F9|xRcg1Uea4F<UiWI<^Brf*r*y`(i;ivkO{gQ2<*&& z>;{RTyYC*G<bUJ0p)~Jh5K@Pec?=+vOT#`lD@O}LZU?h`b;R$u0?0OLYe;jlvN~DS zLHm<c*rnWSRhe0}E<HBuL-&{x5!kQ^C-9oAJXOnCz_x}-7+z}l<NXcwxgs0hT&h%{ zND4{1?{5*7-t$##BBd$pXC}yFK;;aWCN~|4!X$1(n=py?BYw<r9L@CPGow!I_uY29 zY2oW`Vg29>w~#{H7y4@}UYUp|Rj}m2wGp&Yma#r=A7d(bRUo!zwqRsvTF*j)zMv|m zK!|DE%x-)^9m$0K*&;TBq+&BEW)8sz9(RM)WmXx-HQ$?_>N4$94o>U1n_B0&b)m!3 zGKqM}W>}<&z|OdKVkGG5KqheFS50htn@l=irp*xY9&=_e@y849q~&I~cU(OA)mwzk zlY7nh`9VnQne}o)JXAx?ywedHp&Z&o%xj|FvJD-Y7i|6TZ1Yfn3tB=Qs#*{)VbUcY zCk!=`B-W$zV^OA|fli=^;ZWzdE6PbMEhdLniUr_Cuyj0B*bz@m41r1%E6Uf;$cLl` zGp5T}M$qV*<W9`4nYkG#;3rhZ3NO;;)YyGLyr2C3PJuwAs{$=5Pnv#9nPB!4FS6`- zhu{;r#vrUu%iN<$sNiYc24Q@U+VGp5C9YI{Ba()`?}JCz*b(>}!CxZr8Q}x2vhNQa z47P}qqJ`3YCY%zwSc~b-XkFVPUh#49+9Xya^+7npl&BN_?x%Tf(bikddmR>KthAP0 z?nktFzv=dtp*z^p)L9NfjfpBB!LB%7nlcDT$@`$K9XG*ZelJ3zn3}CwOx{wzSwzYb z>fByqIaAX=@9GrWAje*$6jj0;+?a)xF&G^?#qrSZ^DgJl#fxhM0&@9#%NcfbbVQi- z2_(BHQv>&2raZ2G%KSpcquFlI#KFzYZKH+OJ1KKseY-Gpg$^dm-bCTuUendB$v0+$ zZkq^<7}m2o5<5l2{FfSUg=RH_dsqqRV<$;JkCwJk-4f9SBA4W`o)k5GwCEzmB2)fs zStg-OHJ`Ua$tWtUJFVq3p+@HU`}5hkO2*%nw(GgrNxrU9=#bX$WS4#)PNBA()59DD 
z^Na3H(sT8T>k3?tDKOjSNtw?dDmL-i$XI4g!tIEjr!{`v5{mm7kiB=e=j;BRn+sZ< z=$p&w3tLUAf!Es^UPq67+$ngSc&+cb*sZGwd3z?y6D@=f`uu+8bAx(NJ@t3z^ZvYT zEzhw_(an5FMB8k{Jl4(Z(YF`%bH|;Y^<gOvww@b-kDi(tV}iJR9i(-8d|LmE$%5wH z9w+ik8gG+X&_GQz7BklJ)MG*qm5MerE^Ij_W%fE{;v_qsSLK^4+NL`!jDJQ*W1BNW z7B)vLOR^M$n?ueX=plAAw@%DwjC7T4rmI$W>wS*ZxuN{lYa&fqm?=hBaz_0w(hC-~ zaD0hHG01wyp=}g(1P*iYi|m&42bf{h*RRU-E#Ai1F~k(s%-Imz;N7%5HmixWgRgh~ zu31_Mj$bOWl;BOZi^l43`}UFDZNI<2pO~2V;I~U|YjnRBfgoOYsbTjMI9y7DKi<yZ z)SWW79jD`nnJ^OUU^O41tOgkvei?c+*wtx%3*(%`)zS#V1Q((RjOE=sPlcs49$r=3 z&L8;mtqSR$ZL&LX`E=Q(!dm~JGuFlU6yFzVD&jeZE2du$p(2<EMVn-=PEUM{#jlTk zIxtKPYj1N8k@;K|;?495O~+B@I*c*1uEasQ^77m{OtC<eeXo5G?vt<9+XGYZc)S%C zrJv7AfEJ)b%)7}!M#w=OZl@M$_;D`5jz_45A>hQ60?<Tn-7v@WpN`!va&_g(ie#!x zWFifw#v2bCr?L)&q>=Ny$Oj23UdF5$pWibs`!kd`A2w6NBW=c*ju$*me+(}lr0Nlj znLkN$F0o&8JQv~BHzN$zh8!&^CqOJIBwkG?%kWihb=swpXczDHeH4PIClag|`7;NM zF5-wZF`zC`0*Uv_Y<B!H3HPphMR~*4DXxC`)THNw3(ewJ>tpM6<^t7QLY)m7nI<(~ zhU`)o7Mx@L4!@XpX3V|#o{?TS+n|A?AUpYVc6cqtTk&c=ETcsSbyLUla`iCOV=!yZ zCV@U~9L@SRuNuGj66a^MC(K^k#q8~ep#qOd{BYo#^J?u*47zmNfAy}dy98y>vc60c zpMExLmwM)huEW><4(sjU<vB5a6_--U)|U;qd3<!D9mR(ku-?1VHF-{E`%;05S-Zt! 
zaebP&!}8$osOIr|nY&;8k?JPZXxMW8e18OnAivPrZ_hNWri}srvp>!UeZquFG6$M> z3d^sjxguVim!I^XDXF{I6`Z7;m9eOlXTVM-Ty&dv%b9hAz_D-+Q-Xim@15HtSHzO2 z@v`WLWj;k>J=Aaakz9sK73s-Td+V;cYM1*@p>v;Bkl(~ZW6Z(E3Os!$XBvc0NwYG$ z$<^Vryl-JfnD(`OhV+KI)3MeOxK#Y%mYC%LF7K1SB=tlPiG)gr;7xi{pc?h^nsa9O z#f_|Gmm6FRRD3<}J`5rcE3RB3JlmG_FfQh48*w-DYWS0CLd$RqfgNUW0Z5qNx~bW& zbC3-`+HfIFK?qoYYfOJQs92)pul?hAqC9!#R?NVUZLbJz7u2eNl&t~zF@wGQ2=E`k zOYc5-2>765RMIFBFW<LDSYW-y`1ciMpt1>27b^B!abPT2G7rhpZFifPt(`PFOpCCQ zE9v^a{`VAOPuF>829^)NQ$W&^2eoG#+h@E_TLr-OyZ)u92wt;Y+kfbyn4b>GXRn<^ zZgx`4eMFLD6tBe66s?C^4!VYQd}g$Do8dnDo!z>4LYh|dCHt*r2lL~|TY()Qdx6~t zm=}E(f}b`Iyl<P=#L7Bzu5;|;3BCDejl-AEBKTA4Onhf)JnC0<NhbN-XShA0A~C8< zeNG1XT0a(?^l!j7!LlNpDLQJ<b~r|#=5xe%5j<Y*#&#)jTPzq3(vk~#T-z4j&hLJ4 z#KrV{ED4(-+oEkp()Ku~-pHyzq1Cy6ZEl(`^`F_raw!3eGWc<IO_0)srF5dC?Y_>L zTM-09^#PJSds|z(e*+=lC)C|l9rrjN-HWo7FOc?U*OI@RXa|Ws`JtF`4qU;_&Q<*> z*`>25u4<z^B4f5QO~~VBx4`?dVVN6PEt8FSHXNA80;!YXZpAVS_+`q`>>9)2vgW^g zAr-9`OX-jHq%`r5f!#9qNd^N0bR5~=#XH@0U}ZalU+!%JOS_oZ(ueU$`veur%B`BN zFAw-Hzh}eG1o<-(VgBD^mMW^mhU`dOy5*rIX%>i6A8WIC)3n}ucOs<*zn+j?)+Hwk zASa5lt}#U@=7iSTgsakc>B{>k;-t%k!6(VN#0uJ9XMhF&Wj=h*rLbsPRER&4pi*1N z#Tt=Ov-z_l9(oG9Acif$;@ae36Yv#HY`kxYNLKFc3#xKwB8}0<R;d$Ip9n{VE=)Wl zi|8p|hZI=ZXCQ3B&7D1~JMMBF^mpt!7*0?A(B5jpCo?MUd)&YDDrFYzV}WP0m<z8f z>E$ok>@L{!rq%VvZuveLs><16&pYVtU$F{XI=(50nH&^a?yU_|;f;5W*CP$wq8B)R zvPGnG?&72)#i-TF(twXu)j*GyqS{r^>1wtX+Ua=k$NEUV&D2$QNoS{A>E*2R6#jC= zkmvMdM<ALW-R%~#|6sF#dt+=l<+UZ0!;YU1C(5K)`i$VvSjdd~K~1A8V!*1+?<Jdp z%cSFgpT?W!tQ&xp;y9Ji8s*hSAkD7?Gd{8@!(n1A$HEHPT-ww`sM;JP!@U2fBfR8w zmvSzaE6CJt2z?%-bRkb`_!bKcgd(8G+9ef#0#FZlGY)355<>9q^nd4`^#7`5XDbL9 z(t3ZKAp_eLn^C{Ft>0E@ZfQ8KK7;=K%`EVV8T{8Ze^Y82=g07zN8Wv{xl=&V%=bTJ zY7pNG2OXYDh(B`2m?w=N54WEC3CVja>YGp5?cDVB^%i(%U(;Co*42Q{EgSUKv4wc^ zP(FEB#s3g;(&kPF9Y639%on4YYz44?J6i!co`Ab$6%||n|K|Y4`aebpOQwHQtZ9W~ zUqOy#|2*S*TNUysRvCdZY$(}UqMiX2r^2d#lVr;Tu##Oh@@HLchT%?eWA;r*{x`YA zwqY67g$CmjqccUbyDV|s`5zY@CE2bH3M{B*@&MOV#JtSx&gOn|W|!M4;WmBVsv`yK 
z>`h!;$DOZvX)4d?gpltse`UnaLlNW!)@ON;$rC_+vo1%)0Kf7*a2+PunkTP3C2&~A z;#%dgqIr<wX?)5!ZlUeZ=;{D+3QK@R1_t?QyfPonyRs}w(ZC@t+O6fi`FdK$p_%%d zd9s$vKVHvf`L5q<n1eNV`ET3T)wN$eYP$lqK-%gF_E=Is-5HiLd}J|IbUg+8*ba&F zvg~wD-cJD%`s_FRqk5}0CYXRf%zF=^vEXF~_L`aXe;lN=r;?nM1|y)I3l!3Lowj0; zyf+Dwg#oEh_1CT)4wUbIuMHi45FauJT>wt5bVF|DE$3g{P#bEa9~hLDfPth{e%pSb z+u(5X-PTmK-Z<11*M}E79b5VXB-I7j^cfc$2BXrFwLLEnEKToj{Jqc{5PDNi0}oAz z+Pnu=2Czmwg*N>J2lU{Cp$u)T19eC$t#i7-<b+-7X17D}%-&xE16r_#WG)jzoyt=E z{U6N8e`8h#=enRMu|i8Es3s@xu!9jj7AF2nl<+>0#`i1m+c_!);(y`=KBx5@c9CuK zsv2068){3&RWg0aME44lEv3QUoXETOxY}UEY%x~UqpfOw7v}VWzH%V4viuwOtudfC zS>EY9W5!0^`}E-m9zixdv<dgeUmA<%XcY6CCAUU)f{}+Ty{lGv{AhbUi;T1fTrh!k zdqKJHe=bmyS}Lxc=p+1G^%B{ZtuPTpROW42p%{_re-D-qI`LFTi^d6$#`E%Q5pNvK zDRpXJ+h#8rdc4Z%-vxg%s3J0ROVZXD^kR;0H6I_y3v->QYAz0XhMgaZL4b`!7;oc% z8!NfWKR<sFJ9>w{lFmXxIxT1n%lI|f!JrVLWM15WLLKyZneG^HDc%gNEMu*VHvjKS zDuIcG!=yqhs}e%#Xj!4@#rEw^pRtfi1(d>2fQVcP=j<<-7{c!KS%F2!4@|m)YJA=x z*E-W4{5rNSXbY#S>14Q-kU2+%U2Mexn80UOzu$)2f2}!RTbm`W70N5~gp^g5gLtqL z7zb{ww}Q90rT4!G{5Q3p_lf~|fAFUGA;1U_*q)^o0`aqE!e?c=gPNSJ1K$bVk9O__ z7xYDW;METzi0-z&E?|=aGW7sr=%P&lzAn8Zs{%QXFqzPkrBO{t>giugWpz;?gGr^0 z5b_$F;L-x#8yTJ+BsXL&g*3JP9yfgt=Lke%wwS>=V4|@DTTgQ8*_`7Db~|(GYq)LA z&=HI&E(=v|r&l-2Hx%C70R6KCEGR%cm8XC={aR=#{=b{0qJI(Jqlz@ln;nwN^r_|< z{sPP0CdG>|sf}PuawTi%XbQn!-Z5`1*%d6w=(Uu4d*#etqLbgn3lyGdJ8*hC>~c{# zE=z^ZcZ^I$oy5GSNFZqGT;@NjEEn>*VD5aT=(FyBrTd{Wdc4z69*Z}V?%!<s^3~hx z?f=IP0pjgud};7tftJZ?i#uFjz4qiU@p6rYT*<Y{fo@mG@)DS6qLjKm$6>ovfUMC3 z`I*U%`#!fYJUU@=CDE2Z<~ASDP_XU>50fRlDDS*iQDeTgWH&!FyxXw<*@NAgDc)zo zn=p;n`kf*&1Auf~^RwEs=hdeRG+D$dOMPVr=gU=r-@8<bYc?23*ONk>t|D&ce28z1 zo3pL60YF2#FpfziFar2PdZXCJ82RZ$;@XL)o7r@L#10_7XaxZyQ36n_&=Ki4>)4Nb zDjl{4%=?4RIr$Z&rfmprN4@=*1~`U+7&}+{F(OTNHO_MdYg#mD^@<@FN4xUNAZdfQ z0^mwZ0-2f*Wq3IWUxqO1JjtAzp*gahO`be0t@$e*u=NfccQ$XXI)Dg)?c{RZR;#CV z`wbGI60iORw$^Q{J7Rr?SoLb7?2PTP!fSci6a|H`cg@I$Gfyh5|9x{gal_}BB4&Fv zo#)8~PX(zD2#xDbV_cxOYi49xd$zy7zi1%F3tq<_XVG8WxKB4d7f$(Y-Q1>)`XzEk 
zHyz=+4~2iZ={Wi2nhVllHaYR8kv|SbV0~f)VQ%_*l27}cSed0%N5Z_1gTwnH-q;{R zgNBLA{{2Bk8q_wXWmpV5Y(@2&a@fM*$SJ~8mNE0*bC;bSmt~$byfAb|f$?|Ep=v6i z(b&SqR944*=Pen`i)Y>`cbg0ZuF~+Ac;;aE-sRU0W-W>FOUB5iHwlJmPaVLfr4C<z z@v4yzEPk8mW=-u^2Ec<Ty?rDn@<61CE|@5+3xNQ%!$)l{55p)zz_fnX13chEKVgD^ zt%vFBYD@(wLw;EP!4yviFy;p8Nvc^P)0z7~A5`%HuVOHF%fddJ4$ItQPhiEPRci{@ zG<3^b_VYAOTH`OFL#w3>->IS8D`^I%m>DikYw)z-P0D7-7JihTE$BJM+Be%h=7R5D zNv6wdE{kXqWh33Jb<HdbuY)kZCr7%ax3?q#KX8MdAmc(kkBP!jiB;X)Td1IkI$;#t z1b``JU_*YG6E2*anX;d_DGU!4n6hc1i+sG{#t=Sa8UR3R>li-YCmbg3fBBd%52hme zz^{LUgJ0iXFOL(|sxC-US>}&n<E5qU?{7^224$?P9axg<vu#QC{2C;Vd3Phgn4aQb z%MSulWR&vLWWZN~hUu+@Q{L`FG)(cRbvK~9p4vvAYnZTjAe?Fann_7uIHnDVe+Y<9 z;=13E!N<nN4i68vwzft#20i;pIz*jG3P8COOuO4ojELaAaKZSK_OBv}y5>-g__Gg= zhX_RS#>v@10IRt2o3#+*fptLT0-oQZmVbB|_lpcc(~imduT$PfZBv(*;jZ-f4;5(? z4rsU(7<{)$F6R;3wmmOQR&2tT?^mP&iR3uXSqnfk)tU^>V*-banB^b^K;1eB{K9cY z_7zx<&!O{|gPMgY_w0id((o+V%-Xng4Hr{<YBUC}<2fhQo@39y;5(e#d9An#+(v$$ z1HOwz85X7HTxVJ~@BV51oY>qf&Yzrm5TILd(k3#8eg$qxY!!0y#=+nRtf1}4_dFUC z>SyiZW2kYN0CI}fRiX!aox=x@=*C)jP9-ogY+5%@3sXe1zw7<t4&T2(^a!ntKYsyH zIBbIFMja+@IKy_G`F1P9msROLeLc>E8)WRrx*svh_n7iue}qS7iwx`W^b)+ej_nnq zzQ)aw30;>E*v32agyLI$9^O&Y&_<r-5|klyT%>rty<m+~rK#tceODor)7sm++y0{b za77EGMu|<CgMXJ>S>JneLHfJRXNmfJIHTpJuCpt#0VGJp9@1KD41HNQ!dqk7n_aXf zt3}xbNc1BbsVfL@T|5_u0HCsNy#X$ZF@<Xk-O{joP3SDJ?Z*FX+S2Rfo6COvL^J@* zc$jg8SsvTNOuZWx|6*Ywi;2XZ9v!dK&YeNAq#1yzSQc&oyaC~5W6$&TSz<78_}@Rw z0=+3x#(6%RpyR#?4qC1MnLeGRlOn|dE1az|?w_%}*?OUm<pyzM*fgTcLEn|DuB(eb z(1`j770$u!bFnzj95bSF9O@2v<VXJ7ODG-kLrh|6nLE6&9t%j#G}fNCt-zPvQm=3> zz{Fu5J0?DWF6+qtn6m?zm`0-PZVdVOn<%%i3{S$fXk@4gzkdXfp$SVO18q`1^sCk7 z=Ty;Up2fe7mgx1Ww)Ox_pmF$vP6UjMjCM|1#H(pB3R(+b)weVBW{LJ9cIy(y%RY6< zhm<`sCS?dEd|Rllq0#xwyx*^(oZ;bLo^VhhPB5D;iX(1MXNO>r6-a$=tw*9y5S(DT zmms}3oZyM6X6HQQ&h7c91Hh5ez*`EmQ*%EHHZZEaHrd{~!GxqdUkIK21Zr3w?;X}# z2Gn&u3W%HRZ&?pttf(o}<@f(3ihZ3cO`swz%Ij8Sp7^Ll_UXXzuwt&fibnsVxUSQy zEd4J>2i{5rD}FDh&8JbR`<0||7!{g7IIT35^GD0a=PA5_uFIyY8ov%Vhakm8H^)H) 
zojvY9Y3-3D43PS+0I6@+bl50f=tDUz{!@%6FI2Z;l?;)hTwPseBMkx-LQ!7Ahy7Kr z;OVJaw&v!q4B2ZFht6HYp;_>~J|-o#km|&12ha0|@d|tASj-ey?-}rB8v$YvS|38{ zSxZTN!T^%+m2@G@f=N&`^*b{tiO#Io&)46ioC<ub0JtjvDcWJ)hgtWZYy;laSov4N z9>eWNd>4J0ZaLMF4ULnTtUn!AmJOB+)~do&&_}{>e{eN?Ch^WlaIb*wAsn&k2$Y^l zsW!<Nm`_7}_kbD}sTHOi?6}p)09qBP75;ljHg5uZLKoGcMs)b$J@P{b9#VIcesds1 zQi<4tgy&7t1gD-3qo{>sl3Y~24LhHkotxaB^LL76#KfiDNMV4g^yeop`tluLroATB zFJ-_oA7#XB|5CUd?sku^O*w;kQV%J%C=U8kKHrBs7FMGwBAP{-lfKR&$Ye;XjC0;v zVKZw{D?#M7iB#9U!&eK<1`Vj8En)7*d}DqVxk2?TcGZVkNeMSNIWK6J$T7R2&fjtc zUWG}B+QgrP<_d5eew3L_$vP}Z;@DWZEB_Y^?(3=$C%7q!m@@p#RVmU>iUls49u@t$ zqomFbaZkQ+Etl9FgT~1<!96h<4c$So6)X5P{Rd4=)PH)}aiD73{JG4f3uu#5hJ97p zT7hBI_7Tt7J*j6FECk#SWt(bdfN-FUqoD@0I1O;tsGg2}W@K$`JvN@#yRIsr&{0W$ zU4h5yuipUF5-)lYl~fs+yHKDg0Mz9{;Q012yr5#*3P_D2*a>D=Sc0B0q;LE_`s-x> zndZQ=$j8`55{y1`#dARsRSqz3C$v$xW>~VnJJ74gBSHm&F2TJ9SJr^iFj67JHc*GO z<k4sM(nKvt|1xZ5w^*ZK*ID!W)ni|Y9L$KHo(uRNbe3$E+rZ;+2<bTR(@i<Gvye3X zsoHJ=@MrbeW}gH<>H!KJBe9eAl+)b#x!Z9CRCcl}qhZI?4QNC7y^i^=MtdjadA}f! 
zC6CmU5HS^jeT`-f8Xy@Pi>MLcmMN!2bdk?y$KJLP>ibH>lu`3V0Z7jD3Hp58(+lzf z0eJ-OwZ$odBYrFVv+$7_z$h*35I^HCNdM6b=OCR9`a`=LP`;f^($13}mAz*1T2bCU zjIZ%+jB6`=-mqgEUK)4>axDL`Bj7tAZUqw8puMWOr|=P=1R(*2<iPNsq#o!%Jg+Zk znQ!Hhdcr{Hh=vp^9lXn%*~Ji4xGIUS;7E}t^h3(4v-O;uILhuE2=(@JVP-{+;O2?F zOlzW8T=8YZ%!6i(OSqcj-$3+sbn-hN0eK|Fv4gfot1BG;Euq|-s{whwQM4CWLbX$E zudn1UOSYQ`&4+*}rY=Kli?qrQ>)>v$7q>YVNfXKj{9Ot!c*FU>J&Ova-}x^eJO4kJ zQGbF?0<cg2&-|y70Hig4LZUE1Rrom--XoCqvY0ZP*03g!hKe+8QZXO6ZD5%{DsN&E zqZoCyUfBVvm0DqkbF*=C<CRl&Ba$RS^KkWVaq;}c5dxHS&}=ER$AcCdEQnAb3ghW0 zQl`Z(92my8*ZjB+l=m}bg`p23_K5BP(IpV8RC#%C0}OAIzHqCr1Jc5osw>`^vbUXP z=ZKMzw6oP{$U0T$)VWpt%xYtFaZS}1$MME%)qUfQtxAhvQ!Mi6?$vy8#%Z@NCk30K zpktId`;ZvT11-q338GK}Q>%EvekJo*&6%J$&PHCp)KP#JynpZ3B}@6xG2Ru&jc@ZH z3uL$PP3J!eAv=%^mwE>TLI27GkW~l>AX$mf@pcC~Kx-gd`2(^vWZr9l(2>YqwLIuw zKlAr<){b<IQiB%N&=<dcnPy_<8y~^@(P=|DZ(`qQ_=&g)KaKUyM^E~<-+@n&HIfr~ ze%~DPorH|p`)_#?-^^2>Pb*jz{SgVo=Ffnlm!aBwGA9OA^>7_2a(kP!n(`BZY-?oN zCsWh3og4OnwyLg9Js_!gi@x<~!MfX=jfNP@(wnG;!acE!Sa2@kzH+Gxwyu>??)OBn z3BBh-ZE5OW<(hn*+AjJW-s2S?9}o8q&+w<^ZF_qKreoZ*71*3UY$c?&{i}i5A7SX2 zSX^Fod8#F<h4E`VO*YgeOFPiUbNJ}C5P$+ofc9`JhHbN2)bp`J!MOPwqO}F7!U(zp z@_k2Mpd0rXFsUpdG-EisFPwSoKoEeyX3gqTpC;YqLfX@~-E-uPl%5<+tas$aOUy`w z2_|Bh{_VpWlNaW3cw4ygqXB|N8y|~j9`4?kC^Dx)5O6@Jl!^#oqFwTA9x55%G+XEX z&=MzLQvG`Z&AlbvRcp%z$Y==yfWqHNd$*Hq^KJ>m8CbqQp2}QEZ8-erS)4>J-8h%% zOKl8e;tNhn#IAG26tNakWa||}G!4|duiyeeXBx*QZD6%FvX961w87{&zRsu?BF34e zpSWaKboDxUu_Jzew77;Qt2|9B^PPY$_|quGQn77l#;_!IP?xX9w#vpU{e=?xDtg^* zdP2wiftCTQ6(Mn+HBEqpf&;-?EI2_jX=~-~w0mphFd&!!koi~j0wB%(Ipgp>A}D|` zWpB~b0Biu32Oyb5qz6yGx$a$W^Ev!i`GSi_hQdH1=y~>gv0cRm4Qmi|Q`{1v&3o~b ziw|BfAd?#%Pw_xFQh5tkclzyRro3-_LsHyRW|;8@_OadveOI2W-@o7nk_lGY|LNGq zWDSl&s>p(rLvJyZ;GF*xo_f3MQ9~ha*a4`m0e2VjJ_KqB>-xJB@!TlRfX})O{*AW= zg?+((M98hj5S;U`8Es9p1XMh-mRkVUsw}?e$q%@0d3m|XER`SudvNv{mD#~yep~Wq z&di_cAJkibA*Oa~MNT2gGJ~uXm%pX#F4Nc+|9GALcQF|J6AfDX<`>L^IGZnLBo@;u z_Fa-6f&+6Ht%w$^`br3cRAn!LR(yH8T;WG4uz|r|t=ygR-9Yh0;OTk|WMDwxg8q*H 
zY5QL*e{Q!l$kzg-e*(5O0x66^@+=_KYcRfW0{`77)jR-=C;<3HDkYfdfJp!Hm@YEw z^~*U&GyF?YNsppOx=FICeY15UH4w&gEl1d#x!`=L{wU;2tN?)u<1d-g-&iR{Z8)0~ za2f6#x<h`&YX{yZX)JtT)f{9n(^Mndaap`RU^C6t57DBO(}jxg9`{Y>4lq92<>mDF zlS=eR)$)5s4Y_DoQ~v8<iiK0jv`0$w>Wuhogz*~}-Z`V7>}k3LJ+Vg+XR!UnF~j4i zGOB#~1O4W;c9!l<BU<$mZpVx19U~BJ(rRXmSi{#DG{r5SAYoIkbg3UG<{_M7Q^ql$ za)D|*ZLHvBRYh9*+x|_{osMjJoJpHSst+=tqptkYNgW+NP_bTcNZh6)Z_Yzri$*1a z0Gd^i*8TqJZC`Q%b`iFs2AryASi2SGNkMf<?hPiy1u-wEzkxHFK;hBG9if!89d3n^ z39PgoxAe|b8~CkR`&KFqlk1_=2)=p6S_n<}P*}@BAPMH7HqkiMl#9;;Xcf2eJP>hp zTKRew><-4nM1@7dq6Op13|;_!5C(JFD^YFyxSxE<u$9?N{&+z2RK0}SX2yzha6L<9 zD&b>rP$b(DftwB6E8}+=Iidc{@WBCwW_uTVqzFs2d8uixd~s9|4DF)T-QwK^zvgG= zO3z^pisQ7};CtT4nKJ7*qdy<+uq>?U=Td%}scm3}89eLf92SFjDPP7}$SH#}#^V;K zWgfLVl2rv!Uv*25H|yCd+xeovhX@X9lJHW3jF8iNb(o1o6;KI2A3nm;mEF=dZ7T$6 z73gC;<1!^+feO^kF2#hrBf0w=j2epbUvm!4>&(42br=j9uZI+J4?Qh#Jj}(t@Z=%$ z>7#FIK%UmgqJkw<-;Y_k<SzVTwOGEF5t^z{vh&23oE0ux>K!TVOYkzqS%6L^D1e`y z4gln$36cQCT=xljGfB;Cor;3EbZ8N>o-AkdIp09}JLSNOCl4@u`cE^*wnE3UC6v?g zhTCf71Pptc7UL1pFAXuJS!WUml<7>a$j~0<Og@#TbH;3!(kzQN%U!g~j=oCOt~F{% z>S~Tzg*zFzmLlsK!g-DCIr%<ptF<ePsJnh7k>r2d&d0cN-lv|GTlH|V{TgZXkdoE^ zr66(`xf212?tl!`H1Aws3AKa}0c?k@Yo=WCF4~t|iNQlmB*8UqvB`$t`myiZ*~ue* zJ0LfD8dd*30}i7nf^P3XsPHr$(YliQ7^|INbD<QN3#XvU1U^y^z%e7E@vPCtA?k8y z<clL=apjuI%9rEqXNH)S5X8NKR+nSDB->Z}8jg^qejRz%4phF2S3S2Sd&NCxz-bzl zM^VDT&eJEbv9b!1k=S+Hztu2N?yA>VZ!Es#HG)r>IMecx2|0?dha$Xd{i~;Y1B~kw z-f<?-<%~b|Y+Zs^5O%rYH*6J`fKD&>4vq>0k9&Qt_==|k|5fU=flw~%o`-dP-d&LI z<qdZLXaotGtcy@?D|d98k4d}j08w8*7hg4V`swBho4MEL8c<D)ocUNl=C=dP_RoKK z1>vmOXY;>hBoL`j8$c?z+YYIIy)Ai^Ez;&c9rAM_jk@vN%uBFoyK)ZMP5M%!L_v8K zlt9>Ntc!1?^(!bO*|lUvEmS4J-=N-V`tW3?&7C1tEZy07(St;#e&&Vy#$$g+NdSvE z(2_TBoD2m)P@p|E=qffH#?^MJJd@<rR=$nr`Vm%MGM|R`BV4S89wysuuGwY6!)vsW zj>_+7S{E~D%W<tNFsc$pBCJS=RC_8hw&*Cv%Lgr>G1>t14?ur5rwHSNk0k3Z-s>*u zu1nA?7kZ|THN3RD-rEU%^2kV&Bu}Q~i?!%#P&?GnJbPoM@72~DSx~YHCysMWB%$AA zd0DL_{YW!kNm`?vXY)P}!foJ~;tuPccAC#~|C|^G$)TX`YS20d1!YyaD^j6ci0#@v 
zD$&lMGbvOJheeRL(Z1vTlu32(zwS&*q{e8QB*s^m!JCnqIO2p1@XY6q_N!;D%&j+4 zTGGgeK+%&0DTejK2S(5Dm%>9xI>y+)NMeiAYDCfHEPp^cgHKhjY^Bzl$`ezEj`;0) z6GM;@D<-KtPMVgo!3W254&+~<%Q_5j438d%N>`v@r>`?IGe3KH&mgEbE2bpq3$k_t zrLPqJGA(Dtu}nS0oi~x3C;2WItki}KvVa(@CsIN8AE2Tz>y|5?{8tzD_Wxc*eEHLG zO3SMq<W4}jC4A&Rd<w`y0pfGZ#LKG%>3D#u{PS|GP_6?>H!?|CUsKc3pn*aVN074f zybOu#-ZpBR!|voGc27VlBB*z*n5v$`1MJkq!~`gjT562u{_|^I^cliN%qi7<^$93$ z1S=f9=pk6)zFf3TsC2VHdvB9bAcc*60I>Fp?rCI5fG#`-%A!+wo%);^e#^!^*-Y)H zb^>MsMeT=qwd(&{k#6y~|7ro0Isf0K!u|i}?}z}LbOuMU$NhmSdM@Qe2)?`S#TOFw z{4)y5X+hmFL%G>cP+Wg`(dM(lJp?M<M>FfuKnVHpo@6SF+Y+662y&RAF=E(`#+0#6 zoVp%(zOmqQjF}B4k=qTOQb8+>fe1aYwU6!NL%)bv>?vH0kB-K~Xb%$>@PzbXjK-&< zl4M1;Au&DU$Z_&$+jZhLLy@mJQ8=anXK987`8{Y&@jsmj{;MxQ#`T|q1)m$QgCknO z1L(rvvW2P{Qk1uq*SibXySmq_x+_8zO<VLKVZwu#%R<*VsnaJ@-n&!YS5rQ7HXcf} zZD*a#E5o2cTw8CdBVC`+)wmFHAm5M&0U<x1jL}?>fjjuTrbJSY|NnRY-_C)OMC<;o z!b15M1}M<Y84b!VEc8-<;Le~S2hwvZ3izZ8%{DVM%o*RrX;8d%gTCZ+A2VXsF1#z# zZpTZ<L%)d<3o<B3k#=+Tk0QEYF+xbec#(`qLEW^oW-*9HKLkvhy#Zxi`xkY@YIm}& zVp{vi88oREPK)e2SAwdO+~J@wbtbGTKPqHbeFk4B*wu7^Trj|#xruO};k?q~QeZ$N z0q@M*;xe7NjTI)#S7YJih>ntZ|3`oFpvlYiLKZX-yLkrc6X5;aF;J@<%p2QvHZB0p z8x)XxL<`F0I3@3{#YUaD!{4OHhcn^GdZ`8D=ezpU%rRO1U?Dm9Ycd=0{P?+#qUr)( z=Vre%v8>pMT=g`LS01S>``KN%VEEch?~W%6!|!p`mjb%>hR#zXT>iyAnPuzM7@18q zisJq|)%Il7HK*w^HPcNs5nfx}Hk#H9w7a$He#3A4Lul8`U8Ay}Z}sXLIn-&YJ`G?N zRWPHUHiur358pJiW5iBrH*>)gR(AfVEqUmMUk&s;s=g~Ek2`T>%<1jYc^X+bCW#LO zW3m00m!QkSPhjUkuLU`Pi2ah=S>W!-dQL@t;pgz&hu&5OuZIB{v>OmR2de#e-?vU% z<@-NKUuXS;YX~UT^mS-PI}6B#G~;uJV~Z=d@-Ud3FMs)){?n#@nHZFg*<}&m|Ni<} zjxz~uFvUR=$PWVlc>!`rYy~O)ikS9?Ne3<*sAL0AFW;!?4JW9-oWm85;(VoqFs-)8 zn6)bSf|AK_Ns^ZiD|wfg=`5m#unOtTMB!DTZmWS?aGEy%7K>3fk3Q3O$!n|j*RK9h zW+>H)g)WQQ*wtHzI{xvHELX#$2;P_G?h~veIT^uzGZ6Z3=9L9>P5d1-y7Xh->uZeW zln4&<enQY)ra>diyWlQ=L&J-we+J(zstEtkst`i#uGCo|wwM=8BDGN!24}2dH$As9 z@s?Rn8~ymR!nrO3a<tQTRcLzT_u+F!zR99%|CIL9gA}iCW|*ep5lqM~BlL1Fh$30v zSb=QgrP6UFu78Zcy*O^EB|k^i@@)&<835trnvDzbdEi6Rgw|Zk*>prjnk$wc<GY)Z 
zmoJ&S2aR3)!HuF6F&hJChS$Iw)s?Si85M5b+j8U;UBIkqTT5a5&WqIjsufj#G9G?i zuF#hwZ*r%jtgH-K!Acfe+ndf^!-B<;<zSG2_mRZZ$;k=u$DSVNM5+EIvsnY{-o+bE zJBjv>L{qoee}6~Arodo~ak;;X3a6L&24n)Cir=9o1bDwH@P1#qM9i5#z{1L^WsDXC z3U=azx+N9c?NW6rOiAl|9Y1O0vAl7Q0q2FCBoY~5Ft)%s_3|>`c(SywIKs4JT)uQn z*cT&Jv`T7?ZSKbUDn{%T=I<(It4bcC#O+_OiL$<$=}=;OQTQDf!41b&?pg}480gse zl~tob7ek!8=toG?;iP@!0COiQT{;Jej<?2VtqTpM6BhD&eD!p}AR=S%?}zn!MHpxx z=DVz#5x90x-QU5F@@j?OP{IVNR*f&_-&+okoa)gm+)=_(#wkjuq72&*BO1-C{(zJJ zZeholT&jQ5=w*Svoy^t18u<cr$&XO!F_>Q5=i<saqe{%?zT_K@yg*d;yKv~#YxoFM zgg?IzY{bkiXUZV?WBb`0#13c-P{~z$&}dizQt4q~pyz|VQ|?*8E(i@tRZrP&OHa%s zy4K;~2RB=8Iz{$PcCFoOIXpC<x6uj46m+Wzqi|$f1=K3dvfqP@cNvBB02mZFvM2T6 zhv@eeY=1;atQR#HnoMEv{#Wph`V>+injkV}lW+)im!wbu8_r<x?6#=eqh<c>hq0O8 zq~c6#+zJE%pm5tOiKsjb+oQ3ZA%$w6qKsPjc@RgBF*`_M_2Nh&O!L_y_6d#!8dN`1 z7R!1me=I}Ah7hKeUSdrQj=v(PAUo&D3Z63bAGculBUfs}kCoJ`pD}L4lEWYy&<kny zE;6&`c=_VfJ8<a8>pLOCe^q-ByB0VN&CMh?dZXLNpb{XI4AbZi1SI6jwWib-SnSsw z+1x`G{AIi~NpHyanr!k4=*f;ftje0R2R>8Mi;)J(Nj(xp^qP8m>llF}Wj<Vc+do)7 zZjZ)H?l^Tha1*_H7`2Z}rH;>dpKL(sAxGlJ7_c;o+|p_7>B{cQ!AMDO#^ZWdURw;s zMWOX+y|&g^_MwY<GB*;pFE*Up`3Pb_P~hFiJd;TvA(4r;Vkag16wTcBy*-mJ=tII1 z>VvPQ_uW;yw|+EOCqiBXqClH}D3wZRIBAyQ6X0#wR}h}HK@t1eQ?hXHOA2@wGV-CF zLHIdEPS!uTr&yG6dY|D<^Ths*VqzqWZQ}~9k+@n?mft|LjXtP+1&dlzX)V;J@<VyU zR#Xs*(Fk=dpR<9LGpJXDfcr{)cMa<WAu70Ukn5v}jI=~W`4I~3b7RI&GBNv)C$10y z$R{-xlY*GM8Jcpj&)gniur3ogFjBgvG1S;PouM?gRw)0%R95uvBnJ;aEU!2#gp%Sj z?h=HndmEEowq$bC8Ag#l&IvOADe9q%uI2>W$uGUhc`9FRnZ%zo0p2J_u|}v;ZbZqG z#Six|FInJ)QZHzlRQOdnke=$Dt?^-zY>CatHN8(v3&WICr%dU%^hWA{+n=-2CoANS z>bfgFe|~fDWOJ-8R8jI<=9E~d(_p5;PRUh~6+@k7R@h^OZR~|pc6F9}N~BA92rJH# zlo1{<iS+Ue`+Z!?JB9UQ6pY75vnzu8!M*7YRF06`MD9m!=cucnu%i1Z)&KgD_sQ|G zvoEMnbT#EiS&9YK&S4i6rW~wr2|8q~yS461D&vSpz5m!aVbQt${v3WkHnO>^=<nG> zu&1kS;H+YV{ep64+QngE=&aKY;WJ$ec^1`Ldd|{@scv^rSz;Ti7-uZ#3sbWiu|&J7 zSSLBcn);g0<d<-q<Xi}bH*br`48-vdFzr|f3a`3K*wDc)0JmIvKzt*TZ4V%`>dYS@ zdifieJBP$TIy<%UQu8Q9amqo($Y;-vw;K7`?Knnon|A$D%cn^r=6S8qZ5yECArva_ 
zK%+wZ=mU*$Ui&c&9a*e&P`!Nn1o~r9Mp2v=yn(mV=C5W7k9w%Q>G0_VJpLqRL|gvG z!(iypsHJjD#HcE;3Uvx2ZqoC*q`ZTg!W&MBr%rmCXx=YcFnjkl?ejB}uj0*8+fldk zorz`15pS)bU}f@`B%`2Ha=4?=M#3cT)d}HuE3E3TF(djGTXrq1FTqKCR0-TY8+eQb zhE45Ov7_>j!Jd1p%%_#|We8{$m1x!JF~;_4envDSQeP3u)|$wMY}toq*D74ld*q%t zhzEK~ZLrYnGUnLR&c6c>o$4-T>=NQknsSh|$yO*JN~p~#n5h33JiL<(vT_tft-wV8 zd}zqfJE?TuytSl4y~MaGZ@X!W7KFmJIu-aCh2_CH+VDUY#|ez%-qm0Xo*<*tFBr=s z^92&xHiZiKFP9BVEoY@9aVmS1?CITwi6=jCgIgqbFsP<@RzC!92rjlaoQ&x4&(ftS z8BKI9K=6Gbv5#)4Cx48#00Hr1VazxOQCP=9H*Imrk{SMSV?XJeEGN~$`F_?1_ALG& zH+=}l-j~X02~t%Jb$UT2t}j}&pgUQrYASN37f=+%9tq!91$y|*9|M6-Mk7Nzl-g{a z`>G6z#6;(1<>lcuZ_Cs>4WsXw)Cl))(sTWmk!P$Hl{ntkvd0T8_8w3BMo~*(x=~N* z-#j>hWvCs<R#Z&9iQQ8g6|%xx>0}=j@XRDJ+<h1mn90l^D_p6da4q$jB1HIuvpThq z8r6IeCr<YGrqPbP=QXj@pD!oMCL1e`)0H{BI6bZV37l;zGZ~nY#cAKxtwN1*zc2-7 z6EU|hU0fHM`jMDMuw{IFy4Y0yH$Ub|Tj7Ln=rx#4ienGS80Htop(2w-%J{uy#zjLb z06D~PT8Nl(S1Uo$P%@3shLtVU<61<I?dj0kvs}Q|mQPNoD>*wI14PkG!_02*;MksQ zi`!**O)68)_i1@t#@-Ey<M1k2C&l!lIio6>xP-<4%`plP;nbz5@Gfnz@mqCtIqgcs zU>DI`@Z)Jfu@#hq-Mf?I50#3x`>jc~WH~%!r^9M}8pDDk(0$!8$TkDm&%T%bR8(p7 zXr=GkWU~8BkY2kI6<?AxskMpn77Fr@9n8`l8gxtRBEc)|EU|nzmIXBP6{S(VbR{1^ zHz)ebRs8EEm5{$vp5ln?=c0R(syD{}*J}7*R>nm&KQ*7(d3VLRV@6-%a$bI|N_w?- zi`Xgmg9TG}OgvV8UgWEaaY>};SJ&9nP25{o94VAJ|CvpGe^<w4EiT|zxwd`3u6+Ex z)b^o>>%NygKl_Yl0}I}T*I%D3oR!%=P2qgbn)U^g_Fm61n?8Md`nRR#eors*)PegB zN3-@7oxB_SO5w|!M($Vb(_Z<;W?8GUFw70>yq06O^?k&Z_jSJe>Vv;e`&jB*Ryk{z z%&QC5rYCtfOWSWb`*F9&o#=0MZFSx2?z(1Xiy7^kH1&K`rgZ-FUqy?%m)yBnAN$+j zo}zQUko{!-y(fE0mQ9JZ%=p)5+<d10@XeH<DW@34)ZHy5=52cAYyGyr_;cisgs=z8 zR~lCD6Z%vd{?5`w{a1wA!YW-h1La5RkJ2{Jd^|_)zW=f3fu^0_dx8%~7!-!z%5&fK z@Pgp_P5xVSg?|IjP?5SeGdtdPrcFWjmGjqbB|lM8nRxQT8%Yzt!gX)A`FH&k_`PNJ zj5!;(GH>c=n?ASd&~$aZbFnuiUT1k&Z#i`O38NdIYKOS&$2nU6>?Ho5x^o1nssD4A zcDYmd(R;Hy`5k~8#sA(=@!#5(e)~=Ew+l0L>h@dS%e(4-_}tSTom_k8?v1Il+w(Nv zJN?nwWq5z~q8<95OD^fXZBQ)oO8MlNK5czaTiLmsvZ%h7tBX<?i#DB`S@l$H-{rF* z{5>vm;UyD0e_T{Gw!ZDV*=M!+ns|{(%@4~%lx1y9|A9lu^GnUmp3AY1d&1WyEjMLL 
zu91L_To~-r&H<jH2nI7sfmI(EG^CEQ23HvTxBr&9b<@++G8&)=@^tlcS?83{1OV!1 BZL$CW literal 0 HcmV?d00001 diff --git a/docs/assets/design/hybrid_kv_cache_manager/overview.png b/docs/assets/design/hybrid_kv_cache_manager/overview.png new file mode 100644 index 0000000000000000000000000000000000000000..ac80581f491da6235c53f14601b74288ead3b185 GIT binary patch literal 39501 zcmb?@WmMMD*X2V9NJ~qrDBazSfV6;=q%=r}(kLJx4bsvjA>E-Op@4Lkv>@F`y@&rh zYt4L`nKf(pz*2em#l0u?-e+G!)l?qiV3K1Z5C|Lvc^M4^0$C6KyorGf-yiU1Tp$oB zL<%yJnx1K!vu*}N>(_|g{$4xu=S)L+{%?`PCT3raC0l;{=+-$o`@G;|#O!P~eIPCE zySD@yLoJ(?E!+p^W*1K0E&JQ?vOaU4y=HAa($d%3$1F2Cty0iXB@bB_l-I)#hnfi* z43v>GRab<i*otEs2Bs%_rzfT6D`<@bu=Au*2R}CxOeRP3e9_&G#39!u!|=4jUXezu zDyS7!pXCibyDQ66L^btevU5mc-szqZOFZr!nG7y0y2lQAr>~~Y>h%T@LR89GQ7>2p zf-7{$&~~{>8H6~!Ts`>dZ6Elc2YDx}3CZBM#Ap_Ke2j9~`a3f6RkMulh)Sj9(eFEq zlf<e^Lw;n-=B8hUJp{T*F~&U*82d+!RLSPU1K2Cy@UqlmULHTH#p&A-=^682=}x?T z^UQdmO6gj34ht_!HJ<*9Tsg?pe%h*B>Jqj>tdZAhCfO>%Kz09}Z}X30BCNCE`)mqX zxZ!wG8r50v%xDR6*Axz^u=a0Bt>a4`=E!985bbgJgzUZgiolj><_cAu51=)oy?IM< zS2|gIflTkyC}RoydX<or#F!|&|J*$u{aW;J=3WLuF$5!!79()z-p%(+*LU!p*8ji% z_CJ1x3{P_Nng8PtkMM&{+S;C<|K8cTdcZ;wu=E|hYajh9ZFeF%+_T8TQzWh9krg=y zB`Jm@Nw{$Z2KCNtK|+2&$9+^O%JzAxdl-T2v@)iF{q%p@rdcuDIg&L$4HjPW)eZ9O zdJ05Ejk6KaU$tVOo>6AOa~@#?o1kLNS6g&cm3Q6ey!YGe?=u!GAJ6N_;eqtjF_%}U z!6uZrhK71VE<+w2LAO}=ek~Ca$+o<E<#f_HYq@a8VB5&2kQ009V9)%Zz);9VlkO@t zs!d+TK+w;9t@Fqm>Fd%$vcu4tQo4bb?yuF%9h_yXw{v3{8d1gqDlvzp%$D5qxhc0z zPCoQ`)t1%PT5D<;c<y)i@7oYd{+8l&Km77**Jvj-i5wGEXKwmp!Dl*xdS<1`MAY3u zw`0|qEO<dc1z%?CAB_sWKqLy{Yv6{BuKOwlAH|$FI=xM#F`0^nqt(vXWfPxfkESWg zruCmURb6;=t?yS3Y^NOd^w!LF%B~-emt67_e7~mr%JH7o*S5K%+@r5*=Q+Bh7%v1s z?i?o%Pi=*Zh}Zg6aKeGdh4a+syrx74gsi&t=QvOAXswvKvh)l-OY0oD>`IOGcfM4D z26LD1l87me>v{HQ-Dn9G1Oj5{7>O1IB8Pc)Z=nwGM4Cz6QBn6dOUjD56Lm`{J+h}G zx#R&pfdH;hx(e<0+kvz?I=Z2BJNDW&N_t5ds=uStPLUD&E-sFgR^+GNfnDv@daOqN zRg<U)eAGuocjy$H>jmOQWlWLV{pl34-U$ZaQ{WTH!c7V8z#5^l;V%lM<F-rC(jgEW z@B6;Wng+f&ej^jz7&vEtvlj?Dh02B*Bk7lYDTiB?ZcTFUDGUweriCZ;d2J}~9(PM1 
z`ZGl1w6$H=Co^)5j}~8{N~Se?7gR)C*8dnLDVvch|2@@Fd?|G1f#*Em__9>Hn2+m` zlr;5`-P-8qr{M<$zZ?Z6$|v@1Ch?wHxLL2%di_PiMJEjn4LPAc{JS|`>NaeCu*$H^ zB}TJ<P+6nLn%einC-q^R<WP@J#RLbxFd;(b{-V}mvHF|W@aufGL!Q=JvJ;<AwAqtg zT(8;F^&88cb7gdN2Hn4y2_0^kK0mZst8qBTC80pi<SW&i746RPiarkz6<-oi#-Esa z7%p7rN-@Fu`(H;-{-f__aUqf0^DRE@!P}C`Cp%LN_?bfYH6BC7ihljrkX5+RuM5xR zLvNaRi=tEv-9vmSmo{znJ6Qr>YQpJ#F7z52^jhEdH2Q4K{G$qad;f!`q76^H1O`8w zQ99pMs<!=<$M)RLZtIp_>9MlSuftGjVYk1-4UezS>&k3uXo<pbHz(7s`8;-aD+(gF z<1~K`8MY-0ySXNQyv+Y)=;}FNYeX}{x!m3}!61xonL)y8MBh=+{5nUy2n~lO^`Fq4 zu6x`MykxF%{TB%Rk~|JJCP&7L$ZP&}x7xRD&K>r>b0SrtVP#@yc3r(lE6Qm++Mebr zyv%CR59s78F~Ik>I+;+I$D`!482*$2A%=I^yfl^fv}erT+10wz^Vf?AlKnZ0QUiep zg7@ayi_eb23*3Zw(uj*5u=t?+({?)~<;%v>A!RCRD8%|N2`Iz}QK8muQ97T<X(;{} zE{}foWX#181FyxwY;B^_EmKL+jLfmVUJk{p?lT4dtJSOLslN>IaPWMLO_}DZZA^N` zw#FTdinRlj<c1MkMc?~onNJt<hD_Iwo>*!!tsFgBOF>8FJ#o0}?PzrVOH{AgE^kC~ zV(EOY52cs2#IRG=me+=!|AbBSU#ZUS8@*ffxH}Fb6UP^Q6bUO+gdxXCx;pVhD=1$& zpFb9MbMU$xT2CwCKX-q5>a>v3XG^jz?)&ga%hXz7Z>xCin7V<dP&IYpSqDE`x^~gX z$xFi&F^^YH>I<py&&=rimJcLyw9jkoTrL(T`K@L5D%9x|o)YElB;Gn>%AC3vIKoa> zWLEL+?!gCinwb{6E7J-L0ZOS-<P4GkJ6y}2sSX3&I-i1V8%!qVpE9N>F=RS+y<egW zKgF-C5Xl$QZ3&+YA<A{^0wfBx2S3LxqT^CL8O|K^&$<(g^vAUkUhyu5gobXG1|w0} zM%aGr^4t2j5WjV9<3H=Rb?F8aq{NJbi>9z#H8#VHtGW~Gzm0@Ec9w>6lQs1c5|R@$ z5~(Kg<C0K>H0X)Kby17UY3qtYJkC|@!!bW8#D0p^{JByY`P{1*Awc<2jgF0--MnIQ zyU!azyE*Q~qwLbqJQSPlLXVr@syKOZFu3wJF`CD!I>lnn7h!B|K5A(!9xKy;U-ST4 zpS)<GiNuC$?9VVhswOHSAN%WHZZ3XF_Y$92r@VY&zqmfSc=1DeOX#G!oPd!CAAf?4 zO=xj3Ih84Ib7U~?IdUr9_s6_eFI>(y&axFJQ>({yldYW_W*RHn^SB=$954|q61u;f z%2JCR<+>Y5s7H<+@m?X8rMu7C%D!N<ko5E1)OG5;#9szU;y&7PdWy2Iio#Qy9&^vz zBv*5Y*J-8xvRKWh#63ByqGv0kZVR6AjeGhbA?8D3TtY@_jwVA1xh^5gZ$|;TqNhdi zrWFej%$83cjyen(<`q1mASWk8_3G+eImgSGX)(`>sYo_Qc@o%Rv$#;>{&!n0qSR9P zqubRoaZI;FskS<iRZVVl%E$>;ptr1=8Drqc10EcK$OBJr-Gmo^2B^Q1?kLlUYceHl zin&tKXS$rUo;W<G=6F#phoks#wDI}CVMC9F3i%_VF!@+YUK723-<C<cUs3Z>jUMM= zp`mv}`a(VT9rsdda?p-!mbT<jMr)<{k<s<N9*;kwW+EmdzgufJYpD0|uCTS@i-~qI 
zD*bXDR7CHEb%OyLHy2+`wap8QuePg8Atp(sKGJ~*cB5<hsBJdgs`bv5Vad6&;Lc98 zV7Zuuwv?n~Bpe~Cci$<wU7iFrzdjCJ8ScVsbKH0rnw+ds(9`#8V*enfVYt<6|L-LP z(b`%oFV||RPMhsuRGd#*2?7P{N%7vi-!&Io3hlrNK{h&Bgm#hyzEAy~kLr1krg-a~ z7F`$MzBTc5w@4{!2nxu`lJK`QUw*>GY>--y;;)zIn;N&ixvCOFgos!oi;ANE{*Z)t zmDlN{*2N~XFn^Mgg<F*QK>1UFhf6G-n^5|j95>mZ#aEj36`{#>pFgWSwyvR*#OJ{w zLnD$M<mp?gQlM(~q-HFQ<vH4|a4TnGj3rrJ6Kdo!srYTL!WM4KTm4dg&{fyTT1(6P zscPMjse-p%d`(1pka_#hFs>LKgNHE_f&vJ%$-8&&$4Ht9_dL5@<ls`-oQlqnfJySz z+15r^_ja&cxzA-8ecPPyt22^FX<uL6#qc$$nC=2u=9qW#s(qbK%~>_P3RhpB5k@@` z3GKO4;dVJCFgE!2tvBxte!e`da>YQ1P-l$>sysG@I*J+@prL?_ki<8jE{xag>mq6A zpvI%u$y9$Mcd#vFK>gJMN+7%<jvmer3qldGXu8699_6P3k^>`6;rOVM>O8XQsFJ&< z{!2S-K})EKSc(r7hQBP`;qlj#MMi`?`7KZwPZuKX_$qT0Sphi`NfzlVazZRBf;xlN zEps?VMC_|+fZi=stWSBL64^~l?Cl;BKQwDy7rZ(bbRoH>E4@ES8Q60x47mo=RVcRS zAwr_`(Y2!pqhd&R!5i|2__=fHMe+#a{Q$YkY5TDI31QcQarxGdo{r^f$JmTIcyMC7 z3RUVIPu~f<*t*xtppA_ZcAt1(Z%r*%<7|6jLyyyncju*TO|JI2_{xUs&My-J`Y%nJ z7zi>Id8VEZy~zfeN$5P)FXvzNelNll>@!d+K4aC4SgotX&t<#8zuoi&>Tw@%$kSWS za;poYd)9=7ga?VNc>D}*0a=j%Y7pHh5gVVJoScwwSNV;c^i!fB=X)BTZ(zJU)tS#Z zQ{V`V)!hGz`~Po&{y!;{OzS__1bS9?&rtufx4+9BOCzP`H?OgA=oBg@|Ab?moenf% zN*=Zk$uJaYYcNn<5fb4LBo7VgWbQ~rv*QEp$fi*llfw+E)W|oz^T+H|QU}$LQR@e> z;tlEQ3iJ*bSOaz=&GNgRkI(zV>4zRkpLaBG+dH%4oYN^nabykt^P&GRv-P>5vgT%{ zW4D-SRbwl^5OF90(W>7IND{T=8*fdl3sWxlydAYGj-ZvFWoC(^ssFfaI-hCIO~Hqx zN2NUerk^e@`L8q?8L+>k$hUuTXGVGhcV0K?r-mO=k?x!Shh~*DE572RQ_iMMx9_9S zCBuAWhfr$!@vdhD1<fWwBZ`wlHBc(Ak3A;8jg@zKNm@uSl_E=$r@P>WS&d-*&|P64 zQv8``gTs9;<)>VL8h*gU_CDYGWYu0FUCzO2072%cQ(uHMK0P_p{#D5#hSmuE2EY3b zZpe%F4|MJ3g_A(rJW8!aD6cbPt|}N;Vfftg_qNyhhl`MmzE-CpKp1@)Uw5@bMj7zq zz|i0y=+;1$oO?X%pAr%M8R!2a%lm)lhyVNEe3QTKx2KA+-*FSV_y|O)c9Y-rRbe3m z;`g4X2sIueJ158Q>dX=V`pzHx<v8j8{upkEG`>u+cIoP`^wgM`n1lpghq-2de?%ay z0A*xrj&!stu5=%eS`PWx!-E5_y+zr+J9v0er>CBQw1;HGfg?DY!ZP^FRDrZ4?1l#C z-9vd#)Y)WO2@2gI#T?{g?M6RoDJfwiJiWZSzJD**0pJ)!>xMLndk6IlM-pHB@{rKQ zYvEfE1L=Juy#HLvN6e5~uas|H>r*QV<lr0Me?GFy&S&AI?9MJOZbDSSCjW3B_b1T= 
z%E`%5Q&S%wAOFKeM@W>7tR`wbW9H%Ur#7Mv?AYM5LRN@n=l-dXu3}^Y(e<w1+KP=} zZ}J~zPVMIA=J)S1p)$ybKZJ+u95h0NgmD&qDZDRdbmkYD@kVjmFgrRrQu*w|-oGab zm9em}K$XlKPrsMg^F&ec{rmU%su^=FULHO{>N;$G_~~&*Wq=UuNCJ@ZyAsjcIdH=f z*gSd~Q9Fd#{%8XmuH+H5S&VHiSgCTW_kC`;Twh&2Z*tet)YQ?_!wi!03EDF+EGuJU zWUMGJPvW)JK0n?%qWa~aCUmhc>bYy&`Tnlx>%-Hl^PTkc^c&UWL4qReQG?I9yOGhn zk?&dFHVQ^~e-mlk`1h=Ukhr{n=&n!bD1yl;sy?SL2V!WS7RGXg8Cep!zkE6StE+L_ zEJ?`C#?|$(H<|nV=&z-*vDyie_bVo5W_F|2X0MZNw&#s@bIrQ25^shUNZIJ<UVM9d z`*9@Y`b0%et69-_yy8mTye~6BXe5oeVd9FIZ_EipnX$xURV>0*J?S_it;9S(5)8^E z=iOMvl^UbC=~ue$yQRZ&wN6W&35kgrqF&mx_#@^GMkt{&TraxuNl6n$y^j9~lOT91 z4|}xr^n~oEs(=4Bj>ozt$>?wMK68Hgayybrc%x*XB=A%8#KqU=Daqt}#7qP$-^67! zs-<L8$oJ9R_b-lQP2@i2{tBC`vQ)LOSUvhXuFiHr1pD=JuKDcZ?7*xmavkmxkK1PV zcl`4Xgld!f)*)55r5S9=<0$IV!otGT#ci3DYR5WG&gCNB7qjuFXBxKhgeWhuPC8Fa zoj3Aji?g#AcNf}WlZxYpS~VzfEyv5A!>W~*m4Q0NK;_#&gxL{CuWHf%v2;g5jJ~X_ zOgWXW#dFV;Dqc%X`}xCh-kLhKV)eY;H-)_MsVza8lB<c`8=3XR&pvbQga|SGwO!rW znJ+hK7xvixBa9DEYXBR=!^P$7>N?^<n~{u5B_!f?T!M@FlckW04}ZS_HR#`tIJWGt zH?D~tE&SuBH;dWX*=jwCvsSZ>ZZBTsw&qBz@Af35rrMgCX74bFiPzTFuB@!MsYT1= zlVA?0+__Ft`%?62Y(!Gh+}s>~=%$8mYG(FSLjyhyap=76Dn$AC^Nx6@X@z_x+v|OX zluiz%jDs!l$7IH2+Wbe>v2=svZf<WnOxF+VeoNyVx}9}M6<b^g^Tg$ei5~8cxR3>} zW{t)Ooex~z2rH<P_^6<1!FNe0LmPtF?Q^z||5aH$45Xc~L%vczpzd72?=`Q`yX@=T zIzj%zf!%(!b%GY`R4?OWMIs$N57p~z$f{=QABW<WnMuXolyo)+_LAf7JYMWK4yCN< z#DotE-;EwpV0YnQ-zEt!oMn_7)-;jKQYr3m3hevFft_P*Ty+~4lJ$MZx7#F!E@B~X zs&v|jzb+7%a+(;+mkWNMcdu5b(@lCNA4`|{{xnfhQIRL5kxca5D=#n4rbdrl_AixU zh|y<sJnE9?O|{1{c~27OT7Aw>ENyD0a47hiUmt}voYSBiwn6|8<tp^b8n(RVg>cW) z5cE3UEH5ulaXH?ZZ+r10Hl^t%pWXhBEwqP(hlgJu|6^oiZ1FlNOZ`ERK@@Cahg;#) z5m;`tvv&&>VJr}7QtmvM_C2veWbT^fh?QW;GKyvb=!e9+>q!o(z%Rt3m2N%S%d0P3 z>UYG>?fBHy4ga-!%*THqQegVIszmWjPveD^Onkxz6-v&`+<MQ0jSi18Ba-mj2>h^j ze-nSPCkrQZJaVKY4G+DTxe(@mNi*vwuyth1cJjGZb;3?cPfPHXYg=_Z9rX_P)#~87 zYuqE{*L#b%5Cc4-7QIOm6B9X)qqgT-npV@2l9KK`kQ>RZ-q_eUKiQ!X_q{+VsK&j^ zwLDg;PeT@^SBiW4wxHX_$Wm9-^`z0hFnNT<pMfj|1qFi87f_ICuI^<sg#EinB)fP& 
zAK&dqL`>|qGuJZp7&1CJKK}5W+d8GJq~wJ!w+&B<<QudW-^+hSiK*%7KIg~wfwZ|A z){vamDXY#6{U1e(&n}+i4G}YI>*(lkc+}ZXk7dh-Lrqdn;<(4gw(an_f5jQ<`=?JE zruoP8K}!O%Lndu=?A`t5RS1L*OG#z+#aKcW$Ym4PE{`N7B<J)a6#FcY!h^<n<QE0L zrhPF5!A+g$VT|S<SH0DFr?lVu5|2umSlN~x8;FTlqPA!06k_St6&{?gwDwih@hZGZ za6IXh<wM5^RQu_sKhC<acgtIhjLev{|KrZ=oY&Q%6PBgw>Cso#0DP1$IxK;-zE4JP zKo)A`5)Td~Zk^tvQU>Og@_>%l*Ob{}n=5-rBQ{476#Pk=d$S(%v(2xD=jO!d>DQKj z#PP1N7_}NOB~(t@A>Kts9`EjwK~qWMG!t{|A*#`PuBNto&$9LP(a-7W^J#}RDnaMv zeC5=uQJO=FAgP$d#I9amCU*AgU7FUeD4Gmj+c7_u=Q~gtUYn2|&wcwYl@;B7W^d}* z7ZT9EU^i1g*wytoPlK40w5y{dFF&7&C@jU~k78bIgSwaTzYL!bIxN~toe_*-8xSRV zc`HBS=$$Bc26G<E$e_A>yF5QR^S;Veh(r7y$dW!->&L^z{ng!_#AO~<RK(~KshIbA zr;_(E$e@Wmy!`wH*8f__peL>Ix*8S_E~54T`pibObMR5<u<(WSnBPGi_)`7Klq`6O zOcwd8>1|n4Y!slp_=`%d_Y~6AWXdO#G>USq@hQk&&NTGQ4eFL=zrBy#F2VLJhVBPx z7%K7|guTh+mw|ylAv>KdeZK^J_iDTSrz>oCauX}G2sCKlt_`XWG5vU3<kDbJ@4@_{ z^R0<^uP#CIp~cVMTVoMhjF{ii>NSc+)~luNNDhNeX5`<q;!J`?h%wg}#LLN>gA}j$ zF!AvK&u>&Vq6I(%r5fSiGamP61NSH`-Ga^sjrQvIR}?@gD(OPRL_{2=q_VQI{Tbp$ zt=@vcCh$2bAy+aEV+lRIsqNXOmX?-eW;LR`Q5##MD)+w(%EP_-mn|oP3b8f%Tu@-9 z>l|qC@uO2xULXD${PIO&wO*ybYbX6Pe17hXmw(PSN{fLAbY)njL|Yr1MnD=^SmENr z$=QV|51boBxVgERm=u{3#wbgu+B2T#wo0z={yjS1p{midx3Kt-nksBHkSQI4J2^e= zu{EW|mYA530L@sHDFF=~-RspWc!T%UqK#1SG>SCu$Gj)w{MOmoIX^GX!&6;SVh6?Y z>({RU?Wly^4Qgz(`m&2gM@OL^`ko(yI-GC)diVENXkYj4-7{b#MKro^rNqRj1C~M* zsAg;r<zYp2_5Ape3_-fL)CCBKo+SKaYkF;GhdoAKE{fXT%xpMY7Rw@CI$eeT<&1#7 z{`AY)M)Mah92*Q`<XsoP-+uP&89#sB#DvbODuuXjb7^U5NJ!UYmF3^V(5rC{T1aBI zV7X2?sow`qMY;c!r)&d_NlBJ{gfpL+%L*ojV9B0N<+(&siI`fCaxigKKQS(@Z9P8l zOHv%x3(MAKyI9!Vi+kf${g-^=&~2zEJFt1d$4g6rPOkI&QuO1NDKB)Mn&$$xf%0@D zVW@hgA37{P8wgzPwd=1r1nEudXlY9tPyb3FHeWAdye~G>?A(|fl@S-NCE2=JkW850 zt9Q7RoorX@xwmn2*?Lz~E6<S2@})ja$VdAY6&f583@2(D{W?1*VWW_%=XyaQ!3Ulq zVn+DHccy9*jMfn2)8l3Da4`6=^s79bnpPuc_4sF7%$dWV&lmJ=xC)77v_WZo_~gS( zlSlvfI4MG}+Uo0yb8<o_n^CKsiOC?WEP{;NLJ8__ox}V~W8>X+Jn`N%K|+N4=7foZ z!_MMjrAa$le#9Q+w!6Fg_V)H#e+D{2EF<IN#~a`V+mSBp!L>py;yD0e@Z*Q|_Dn-B 
z-aygQg2sjh2#MbAZmZ3S3Z?6BSC{8hl$4ZIRJPhkQ0=M!9~T!h57Sh@Ble~8L&=~1 z`qc#52qHQuiHwZQ%GMS_--xmcb~Gm^Cj^%Y6EXhT=%y=mMo`$>+taIl?4_%P-xd@Q z5D*srtY4>cKgM-y%I#pakKM4@(b3U3+uYRA*}2i{gbhI{;Ap%xRrBNni<7Hsnz*0u z^H!Z}><I5Hl?*Xa*FOVo0w(a|*(Q(A4Nqcoi;MqmP1nu0`d}eUE0SKR8UY$$XYc>{ z6M%PdbMpnf3@$EiPa?bD)kOpJMQm(W505tIm9V<Hljc_kOiWB1*e-5vvo*GfEiIQD zqs0;LA&Fkg3?D)(rQos3)F^@_Lm>G0_#j(&^O%V+y}ZtwJa(Y?qT!H>yxM=8zkZk7 z0v;^Oy;mr`%znBSuxTRO^Zk>P3?6F&<zaYB7gyI8Vs+0x8?rJn5roPB#R|cr*&Zz> zJ-GXk*Y-m}!P;<vD!o$DK&B*0joziDYR?;Vyw@k&m+Tpbev7XsQVH+rIxw$S-=C$q zN>o=YAhGhk(8Hk;G}+8ur#9>>CD%JRMaJqui&40Rt5Wu$Q>7?>J=(I!4)<hewosP2 zF02C$OWRJwdsXpGaD?S$HCI9GOFLT!W2<J*Q8fC*d>9e55`K7LV-XYolq&gjtWq)u zB=(GE=bollPe8Fm7WCm4xeH2I)HTDZa%hSQ>qrY@Rwa`aDSuSWH16`V;W55rVPC<V z%=MMOClgb`dBoP0KX0Blm4K-&g__5%Bs6rA{cAzvG5PQDPo-)3<1a;xjg4J*NAhkD zud*4ou$IbO+V05Xmjp5M`wDNLe{oMQwtC$WEUJxD+CT3t;Pur8@lL+C|LS0E+5Vx= zzRB?@y~D=uTO0Fj*ANSSe$AggeNxYnLqHLGGj&s+_+3_&Qu5yf27X}Gmm7AqvMLM+ zkOT;{HO%KQ*Sq3;vNJF4v2F19@#9#!43Ss&2|@$kyy3T>;<5ZinUde+x{i#1otuUP zfaq5qj-nRj;^b^_T##~f<Ullg?%jXzpuM}B;o-x)<Bpw~hBr&6S<)ferF!NL4xb0D zer1UFFYT#(6cM!il?EhOoro8%i-2zXGoJCOUaeh`8pF)Y3;;VPKwe@#|0X6U2YGxJ zzTL94v|L3)K%USDIDQ!#;^OB&`7@Y97Q6+e@@#)aTU%Q(51z#7cym%Yh3D1ju8E1s z0_3&;<*%CuDA8H?{X28iCYcix@Zv{tKQWid6N7q3lZkSZ>-EupWCH{;M(h<jED4IX z+aErBP^2TEr>D>Nu~_?+4&b5b^%2Ob$<P6S#|M1*{yq0eBHN=!k6K$>A@|#Wkpai6 zs;tDu!NH>uvokZx8D3{$W&Lc>0R4qfRkfdlRqutlIjdeZgNVpQqLFVhzXNq*Pl7z1 z2(@aKT5fJGkHcIVV|xD@zwH<c3rk63<5X{N2{bJpWpST>uKy0#Ys`H2mlX^Q4BXvo zi;C9QhYOw?7`)(`48kI1Gi;uKtcB%5KvPc9E(Rw58#rB9SXgXa9Ag;f?c2K5R)eq_ zi%Uz-$_#Xyl<UPF=&(?>OVP~j9a<08jQg)%`L5(8=#Q~*an~8tzC4+@_k4|2*X3%& z1dn~5ZT7;?$I<09iLvKHe*qFhSdCZDC#R<tE&|QO6z<m?EBZSm_NDgZ`FXjaI1%ra z-dd_s+N^HXy&lHolW?m!{?ke&fX0*;*tIk)Q9##bx4QKX@R54({cq#q9FMqqe~d~O z47+cR73<-sW{CV5v(2oy-ISg7-AT#L|6z-_%k^=B+1d`yCJ8<vv89!jW@&B|^+bj9 zaQ_22>`hT)+M+i@i;Az@j!vpb%JljCb}D+T8Gpnw{o9UsGOzscl4)-x=bYc7SAQ%s zD^FYhGrGAI4>@+b4$Bx+Kp?&bF3nF1RA8rjdV8VZ3wWP$AZ8%Kp*F9suJW!GYJLI@ 
zWoy384_X1ViM_o&fk+3490)durZU0O>}*pgQlQ}|s;UBj%xBmDz2kx0do>0kC}|%B zoJ@!Fl~1;3{{oWO*w7QM&G5T=HB;{dOa$HEW$o8PIyy>WcRRS=yzjYvo9`un1T7sM z`C(W1^YPa7Gi~h;A3y%$Ip(Sndh+DSYDI4sK4Qd%H*yJ{`e1MG>})M#9-bJ=ETp|^ zDj)O$TPV&V)cx)44;UC2?%xlbe0IZTcsbKVz4*Af<MP+xy6EWWJuCZgGoPQNYvMP^ zYGr%-V@isiy873WLn?CepQaUQX=&7A-d5Juf_4)e2zVob$iJ>aDfzZ&Cxob9H!+f8 zViq<x?X0XY5RehQ$vEUZRx|aSh-!y<Y6R$Xlw@QA)<blGv`MP(lpY-%9Q++AdMY9D z1!_AAGICN6y!4GqrTS6iu81c`<PwoABN66<2RSMJ2mAX&!^2cmRLL)i2?+_sGOkzq zD7d++QmQc#<PrJ6rh4h0mTCin;(q=S7~jrB#S#R0QvaDH0wqF^^>dkFgY`~#j1RG% zan(H$9;7ZO>x5j(HnzS?{r(KG1!bFkRp#1q1V&)T(vlt`7A-(cX1H2A=MDv#Py-KR ztW+^ysi(K+UD7C|-vXN2@ySWaO9&=%9n|oAYjTMq4FQ3sBPt%#jR{Git>l<_oZ=Eh zDT$E#dW*?Q^UttK_w?61Rt(xj6892Av9asz@G)*<zuD|<-Yju7BYb*d(W;0*d0>{H zn2^CfYF+dE_2Fj?L@-ha7M%h)8F|glOcl9Ao@OBtrf$Ng%E}8Em6rxi?Fe~xj*n;6 zzt30rB*t!1FldF1jZMO4aJm#F{%LeWHk^dB3yO55W&g*sgEfG^4fXX1e@!Oh@D5%) zC@k^vbY#J|Zrwui&&tYDW5{J-5*PP_0#_6M#i`k-%@;u%_v2G>ah`@XV1CeVw2ML1 zVIIcTH#9VC+G#5<hy7Qii%r4qGp(rBE7fE=+TBGURv+|Y7wjJ%q9OpT!v+C-Ca0j_ z#w)suk~h4b^yu%!#l>o0>iOv@0}*Dr$}?TvqCqPm*FOjZ8XB6nSlc5+f}*TCQRGrW zLIPykEyOBYunD+T7}Xhw#@0%^oe-JUjTQ{37uQC48byDusa)00)BI$LG&{e4AL*;? 
zZetf>V^d+59rp_e2!J5GNg{3SF%Y?uGNQ?XwY0U#$;r7KTmvMclaoVDfP$6Wl~<0< zmC8c+Ll<iCKHG<SIypHR8XEf9u*Fnp!uI4?SU^B2g=f7x<^kk59|s2l0Td<f#|L&J zG3%k+%*;$^0i5h=0Q}<O;@aEWLA1P$5Qs!aLu1pgW#Q&N7%hHw`R|B=>xDdM78*~V zN=r+-I6H$xbC-t30R%4{9nZz@p&*_?otbU(%lMI_v>rjutN7%}lA{1{6lgA3BrLGt zaDOqUMVVQ#$!oO804cU{D;}T(38QMdl8VLt9j4Z9dO_JKw8me<`tY_5h$+b2cV_j= zZeh2BOQE3or-Elpqwgf8QQKJOqenJ1ily2nWDLu>+Bk?zO@?Z6A=>iG(&9C?=gk@E z$sQ{gp<!VfVaO4B4~keaINW~4@=jYSQMS4qP{(Ro^Wx)=7(Ob+^zmFvy|YxKXIzo# zHqF+Bw~dSC6BCy#=rOg7^gQspX+?A6zp0L=x(-;ehMg@cVf<Vqgs4tdJ0D`Rv$?+( zL{bUJ4Z6D3U~>!%8k<aJ7ILT-Yw7Fo+6@f+>+0y~K+Co!wA`Mo)U(t19FhcT%h1pe zKvttBcl*7?4!|E~!JJJE4Z1oy292)PBZca{$t|ys8UcmK%gg)vUIAbg6%`E+3sX{3 z!le;2(9|TNqPpB)d6JS3XuQsDQUfFiKtxb3k^E)uo0J(_TQkP}2n!ESPfI)Xz6v8| zhDOT=d=2n1i0^>ePeH&93JMZ--*j|xQczOb94nm$VBzfSTx&m_KWOFc<1;%u3)Jrf z$|`8Ep)&XL+p~`oUqMrll9ujI7k;Lx=?WCMxmg6FIpqxh#|tit_xy=H4;6rzrX(c+ z!q3djrQ_n_Vq^?Gn|7jn#z55jJJa9azdZzRZF$*lzLg3A{9~TEV$Kvv!qjx}{ax;t zGxhgK!rQ-n0|nRz8jSUDKFEP~U;L4JUxApBC{KruOKH44*XAew=uv)sy(i!-za~)& z07w2vC?5eb1T^Kng%=A70s$q8QgEohe@FbnEjl`y1713j-6%LHC|%6wHH3Lhi2rp^ zY%D>51T8`6)syuj%BU`3SfE1noaW|cs}!J42R{>-k-m(KXwngYic1m>&5kKXzPGOr zx@_;#UJ^R_-MgF#wLl`G$az0O0tE*L0}%m5l98F2n}^4HN8#mkZ7JZJi3+piCesQX z1|sAyumj3UN^+5uub|t(^6KmBdu&dq)8Yeh;;?)<Rc#Hucy4Y^AoBd*zpMQo_KzPw z&ikA^2XG425l9;(UqnO%KlCp^n-OwmR#u*ef7Gd?#!L12AAJKP{QC9lN}ZAik0W78 zSK-!V!Ds*e)!I!~s(f6!Ms1fYiY|_Kj+yY!M<5t5W$sr@e3360EsFoWKKpHCW^sBa zAM<U!3i_jb*oCX9hXWyWKR%RBUsl^i$mJ{?oUxFE>wc0eeuEZar-xSathi8RRwyE! 
z$7QPitCD3_L`#l_Bqpj<#e^DDX-bPDNn#h!VZNv?8va+!2?_PeAusQ)GqURaV_xP7 z{`%co+#<ICdwcQ&?Yque_f&Tpn|Vg7nRs1hQYE9dG~1m&8qZl-nVI#Qwh|}f9#c=Y zrS~lL>sF_9Skz%|7>|C@V?zgO!>U(1Xv15~xBYLS$plH+&Jo+;g;|}=Fm-yZVbqm5 znjsW<fCaOyK3?D7-4Tf79&LhHSzcM;j5zL{h1P;cDS+SQWM+2LaNYk5+;H)y`AUM0 zzx{J^9)z@qMnuduxU4~p&CJf~vWx+`w@Lw;`Qno+Ky^?9VF+c|?CB&W6_}bz5xE3Y zFGv(j3=Ewri@vwF39V9~;sU7U9*(&NF-lbxP_pOd1Sf7t6=>K<C}^IZo@96L!p(6( zE!E!AO9!_2@91wb=tyNCQE4){xw)BtDhK@5>v|`U*3Qn3hrACy01CSI#i<hj9}u|S zy?Y0&wtUhK3k%BvB^sc#nBP@e=sf`evM#4UTAM^22NNFUkf0zODj^<rc6M>`HifuH z&@QKIY@b24fd+80If;UJ`0ybc8yir3azHQ$r3|r4;3mL#ot&JyBB?02EqaX-)eQ_N z@$nsDl!G_<>@`r9RDOq>(6zJMgw_aweQtEs3{TgaF3c|`CidXL10kW;5Z5&IYlDF0 zkx@`Adz1Ez6vsbvTmCWtIO_0NJ(dox^MoXPv_zMC^_^S}BO99qxd<~8ldI74J|YP@ z6O$|utNPA`IXPoGOnI$`-U=FOYg^dcKlgI{9eq#whOSVHbkArNX#z0qHAY2r_xAE| zbDJ?m!rvimt$j-GfUdf<XVdm7HZ~SS0S^y$7MAMra$1b+`g#F+`mW*O19mYtrHnQm zmX|}hKa4~`DZ2;h-VE@}NZjFiHe&ZZ6YpSIL%QN+S1GALBr|bXfvS`qnj3!TcOqFN zV;)%&QsN|iJ`T>6G+ePN`^#9bg;0@?{@D!HS_w+p4~rPOZhy;3<_~_Om&eM~&Cgv2 z61S2WUgQ#4WQ-6&7bi81s)9<KruKwzqt%EFA-79PwxS{?D+@)X4oeos6R-5XmAyXJ zUL=Y8F@LIsu%5V-e#o}|?TA4c@;&i3U6Jj^!UH0ceVeeZ_NRt<KZ}b+TT&B=NHO2u zcfoQ}mo%4q9hUy)X}jDi<}KNB$FN+5*!viiU7YEW%UBq)HOE>SB9i4Pd<P58lK1GZ z?d%x;r=iAs-=`EEn~Vd<t@!5OpFKTQ<>kh&Gkbpi1mjgr4KIQr;peAMH<HX0ecP*p z{L0Fqo}S|TuExwvT5<9D@$qMm;*k7dP_A7Z5f$Yykcs4yE*_VZRH$9t(%hUxbPFLU zC<wD-P%n9L3V<p4UCjV)7#fNd{AMsSI2f}|0<#^H3Y{JM;FQ$V)QpUsjg3#EHc$&~ z%lfOw_^bvH2q1~4-mO_#ChF>gAXxeM_#`*ng{xo&Wq<k<CWwPz2!p<83y7XjT37e3 z@^H<$+Pn+2VW{5ef-bA<SI0uTWjh0%o%sM|fXjet_F_!?&t%nFC?UNS%<eWJ?SK@J z@mS#_O7!ctVW6p+A$GL42h~EN&%+8}W<`ZlpRGdd3uoubH*em^(?Lf>l}t)bhA3oA zP_(pMOIai&Ab2Zf(igb}4Na%+<t#`+Q*{nEj4Xv4epXUaV$&L*91bFDlJ!0tD=Ucg z&_DsxK^`iN($UjD&Qg=7BdE}MZ;}KHke*HzxukS-tG&Fsy85Kmq{FlVpn)cnGLZGY zKKMXHc(@Bla2t1T-@Xk`Z)`YNS;+<9HbM^Y2gS`J)%{un2fjtFCk}2%`?3_`v>oTI z?c5=D{<F+g&pxxeP=+8Cc+Fao{JWM$;RA(5SjZB6)!(M(u<#H`G{0&dR8&`&ob6uU z70cM!*|{EH8(}7r-8&6zee;iT`S7|<b67xfLTJK<H&cDM%Xf0JW(|S$L{e5zVc7VE 
zot@1~C4tF7gblH>H6wQW7UfoTN=gWd?BMD3yc!b&DL-i%&=C$qMDTZA7JXs;nv`q? z#YeY<9}8=s>#^RxA!}r4HoT!G{wA5Du>z4^XMF9zDGNWuATl-7%aorxVQVs34HO2b zJn6~Jtq658PSqx7?OvfnsL?5_69ps5sw-JWXh>F0*tKc<^5EF;G7$TvVEv7nQYdzR z#*@*1eTm0KPhW5n$zum~tUVZ^Hq86WqDnmXVaaNhcKg`aaf>!T3}c%{yh#0a##wWq zV(j=h-!-kM&p)qv)Tghj%UPFfRDqzy$FT8uu_*A%)sagqaqcxGLOwQLF%MYA%Zc&@ z7=<SJ{eh+pijRs4E+S3P1uPR{N|?G8rX4l5<KEuh2xE|8ps4EV>Xz!);bLNf3V98b z8A*aE!Ts-8DR2%DZn1E1fF>aa+~}-VS62YPm4aFzra>D3F?DAktNmLL7E)$;`EEgm zuh8p525wX{2tT+Y8#A*lh^^DL_CDU;N!C-4mOw-RUE0qz3s$}Ti6?%+0T>5hkC&Gh zca{5%9JRIvs&x~nSKJQ57HM>d5`zY3xEj!R_~hAQM>rDHfHoLxNJ>f~At6EE=;`aD zg0TyjDWLzJAFdw&%>DWECrs!$I5})t`jWXVZ`A7;JOhix;)K}PA(%K$W#BBg5AtZS zKCor?lO3+slU)?hjDLnQ;Da7%EM(3BzBi51`^1<UMW<W$Y(}p|D1X<D$PZU5$}#~% zkhbAjZVE3}IK9^TGRmY;aL#&d&2t@B$PlK%Bp(!5B08;?M@Nhqn=K!a@?z+t_LHA0 z@JnMNpN@%>DCiX65*m^kg{d@yG?dV#$3P>4RXYFsxou3-6=XgzOTgftnVA^~RPrDW zU~AaM2i)|*SsGwDC;z^7kdvK_jErn*YKkz1SHldt&&TIJ*ZhhpA*{m`%mX*|2lnK~ z0Kqi|e`qjxDJAvoO8`m~l`t9t2$_?Miy7u8PSDgsWe5lfxj`3$a2evcggL?P*~QH; z;MOfppXYb)-UVF;oJk;@`WYI|fi?jE#}T^gyLT9X74qGn<wdXebZ~4Qt`D>7*NS+b z?gGCf#_WK|9<c%6!;1oy^zHe!o$c)_aDM<;!UoL1$M?aSs}$-eSY|nlzW|50+0~3> zJ`RvL#SKQWiw0rT!2GPt+c=SRKLEUoU28O=9(6?pbsFlQ3m(#TuLa&ha3zB(C0{VS z1;Mc~Qq(J6KDh~kBB<=S&CN4&b04jVB0@uxxy%)}x*p5QuB@#AVVU^ypRXT36qpjk z0LnwK0MZ5{tSACtoC%rxa(4^_WLH;LL0%pwYmbt+3%ocCv5;s%e{`5{tzj?IWRjMW zf&s(U=B9|rx3?f9gB=01DhSn%A3xTcc3J_%P%FgR@p(af`uqbaF*`_tHVF_40KOz& zpjhbKM-3YrHu<xkOG<iYXMG^gl!WbJxd3uQ;csYY$jZ7mtYZnuYi+GCTn!3<CKC=0 zj=A2;w)5>KxMd7osL2G(?;s*ZYVGyYRZiB8^83lER^wW8PLWN>EbL56hShgJ*SbpK zFN?Au)R+>W<b&617y^MY?B7BNO_JZ=*`^c7DH;(^wYUeicqY$^M|-mkpad#mRVE}5 z2b(|}1tG${j#42uKR;hdD$iQ)eoW!$2IT+l?k><O&SBeSXF=s*=Z5B0b?86)`}=Ux zYMmAo6+6Dv<c@DV^`O!@3s7RW3sB+?lm9xT`?0d05^zPQz-8tKy-iLO-tA~-KBFO1 zrGS~8ou7ll_%$jkE2}LZ+M=UCFF{N11;wLa00boA9oX@vKmUdQGC@5XNiq~gl@tr} zsA-310pyzoY9dfxs1mxBFTez?3xU@eNmZ;}9D;rKTiWGT-8`FKb@$JoVfpJ?#oD(~ z13^C^Atn8+Ta^#`Q1AU)(4_OMU+*vfkPgNc^t)=#$S?x@S^Vi!lH3!Rh#elf0(0Bi 
zDji;jt_nq}7K{!MrZ5h612O<e7gQS{nG7r}?*W!fR<3MLR*|w9)Ej>d{FN>YU(=J5 zeIG@=;0_>Qknz8yj9ls!`I{ns!_$DwdN@0qYG;|RwvW*{=V|<Szt0blv-9E47nP8l zwbsL{kS8~5Dg;<1S<(o038E$yf(jCgltpI%z&Ca~xb!SJBhb!X!`Ul*GUe;>tvU{D zLJX%oWUCa@ioFyuan!TL1%$DgS$j#9nDcV?e^(!2CMHY-$O15%6*cR8pAsWq1zH)@ z5nz5v0dz1%bocO>hj0UOpRbZWm9I>Jf`SrDr<x|Px4+Wk(00kk%Ukp6U{xw>w~NN_ z1=`iw*%_ERklM>p&~Y6bW}v|RO5rVrm4^Pb4HXZ>1V~mM7@@$NcX!tjI1}VCAu6ZM z2*a~y<6sd%e1V%mC5K&{E_*I)u_p@uo}xvJ$OJ{s7TN^3dLW;G`zOo32QW2W{EQu8 z1+8V+W~xyA0T2@yzQNr-io9|LMG4ANb#1K$(>s`W^4j+AOxNMHmvfqR@&Nu1{W>{$ z=5m+$=+0<w?`0G&pnsO)=bD;tQ&O%Uq@V@Ten&%%iYkg{%9GkN8f)uxGRRTinQW}d zE%eWJl|7@W?!qTT%TV1t^J_!deU?{U4dt+)CJKNoj5C|-5^qW8X*_Rv?FMz8SHH=- z{`c?SPzpKznuGc=V(7ic4mPU!9qa=y@ZNwV-mB4R#P>cq;(Kp{1#S$T!Z43ZVzSlV zoY@PmyOQ`dA){PalNhMTVcAwG7M;rSbg@&+c9&U@BDf(>y{sywvN9PqTF!Rb+BCJa zwE8xABIMdFs^-I=7HPuxxI#xM_QjSHWv_$f>O1XX^I+VfQJYe2@Df_3<THfPV#LxV z2TzK#s27hCpzZ}cB|BO{S)=mBu(<UGK(YJQ)a6M`HK2wYyeH!62>cXU?csUFk{<(P zEh}rO$Ije!#bKM4-zwD=YYG}$VkX1u{#rLUNfDz^EEkc%x7DS~wcgfD8zN1#Op3iR zxBopagiCjHDo>nR%;%|{apv%4-z`A{5iS<Wu8I$(`o*Iej#jFo$sVTxkZimVw7^hd zc*gk{+^C`=B4q{*`{y-3%M4pUJXM2Dhl&7}CrH0Flr|H{K#mvP(eMvS=|VLyJA&zg zhpTIeuFy6f0FfJ0A{d<@DgA`&^ue*)*a$r}Zhhh((Z-MFb`Mn#Wj0i>;@WIqEwC$F zEXr&>5TT9|{B~H(iGIyT3dq&enD@$&HEG`H)3K6$KKfn@;i<P6_w&^j&Ir`K>A76q zcB^F800h`!R*h~iUiysNBz7N~UTqnjnBO4w)4e5Ow~giBzd>Buoo9cyR38w46w(ed z0q$`8<-Ntl#iZMU@;D?|T~0}SFS{#@+>h^afV^p$+t$hPdmrNg;kSWzwC_Add8wl~ z`0in2&)c^%<Kt7}eqAH%e1w|#M2+aTz&$=%r3fU%DVzws-|VBmDf&95ow$607N5X? z&BSiM?CSCGmjwd3&bKj~Ul;HNRCsv4y)+bb5jssolOu~#RU2s{Od?~z^z;twc~IXx z&$iENtXB%I*r(hnS>X91At6!v+wFykiP`A74vd(9gv9-5L(2m7D&$5htTgX|jT>!r zvqelaxLEd`Mv3#7et!pg;7#H0Spn1223Ws(C$GuTLUJB9$Fw=aOa>edZ{NOk1xYJU z!`#9G*<UJ4!1YhGMo}7<IpzVgBvS$x)+t;gqJNE^j_$X>OH)&4*hbK;Kzk%Xz;xzx zcR?D;0A*KSU*C%JD;XIXxGaoM;0a-<de+0<29{`U@lGwKQRwg>jJBSwQdl$<LWN3* zkH3KFL8)HNztvO+5J{jG(GmOrM-&Xhl7hrRK7$D`%o;Z`33O@R;glcsjK3B4b>zRu z^WdHqd(+8CjG1|l{_j?T%hyNwtI~22mT*`Y&+n4yaw;fpjL-ee<M%60y91A-?6=Z? 
zDu#XYZJDnbM|i06*QAJfT_~-lKGVjrdwahkE9S>^T5noOugY^_Sy>sFF}Yy%*pjNc zZOn}iPlxw)I#*(rrHQNHmt0B9EeN#Q9+Z*xXB3ToTIXJM+nj29IzG0(9z?1_bte=j zI4YTr#Ob4~2saNG8g`S%l7?~9uU21Ho{UNYT~!c#U}gj(CTQ#6Wx}{okaFZAl6*vA zePSLwhMD;z<&+mTHj3&*Px*Hb53w;Ypr3*;KWxPV-H2W#4Fen74y0Cu0gSu>jRObR z2F$3XHOk(0K|oBLH)wTJ^IsoYLCpsU>EYplg7O9?wbl01AJ+52A|tgKh;AGj5HK)6 z24|JEgM&C|{1$Iz%*|K8oK~Z^1-&v!;S$;lj1ZxeQwuoK2AhxthjuIi_0G%9RZ8Lr zXt02L0?Jx(7OYAtzxucug_bA1tZH2|1ucz5ulk+!go(Cdaaq|m3<OO~?n&aqe7f21 zx>e>rygbO0>P!hBFn}?m5l{hK2nQP*=sxf#DjJ$L=snP$pl>V#X9q`}?r#Xv3Q)U1 zR%j2#u^lM{X#$4WU=9N`^9Sa73JSNY(5wO8sjI8Qpjzoscx0rzv-6M@5A;00gH;^x zSi7vLSXo&CjRAh5q^#W2)5A-yY-3}ym5$mfZ@?DwA=b;dKi~dbrJ-?pl{f~clw$uh znka+IS;7)UI<9gkyX1X!saIW)cq9dH;S*`w)aJC!Z-Fzse-qP%8h4bZm{S73U8KQ> zqzfYZQub%#9-ov@OgF12mb<vt+8Dc@8SJi9eZ55Q6vb_DSv~cQ>Gr4l(ZtU_f7|O0 z>4E9Y05uv~r()iPwN>|rj5k%_Kl`bu$z=X5e1eavakZ6|Y(_Mi{l?xpq(Ojpi>BV~ zXl5yTnv$)py8btu`#z|n=1l$Y=Un%qT0VC9;Ea^r&<@m5@JCdA%#)`B(3o8as^bd_ z3piN_mKgx&NxHiw_4PL+h8yk7eX}0Q1mG@^=fFS*qXSY>(vx9TaVjzIdcYOHYjmN- z!l0_ms15YI56Q_zFlF2GfrhFNqBZF8&><ROss!BwzzuW)dl;|+`UI*AS5Inb5k-J4 z6~4fD;QDtgqGv^e$PLa#{Q2{zva%Ao90qD2;9Dq405%Rhn{}%!IHfUwr9oV<#>h|A z+CNc`2P<U&3c8e(RIC5{)0*njd)HF1?-0FS&#`WF%$^>v%YSYf8XB;*ii$sBegoSK z#%2&a0dnmh9Na<%<H-*wH(=kTAR>y-$Y_J7rQm-l0!#vAe((bsusvf+fZ5%m02>h| z08}7@Fbdk-+5$sUjm;=iST;!epcn~KMuKq!5(dy0oVWPa-VSK>2I0ahEH5vE?+TU> zj)Z_O3tYW7jGc?C@|f$65eGHRO9L^>oj)_hel!ZVB0vgJi^r{X&{_C1M7N>!jh((s zwBCo6{aK^i?9i9mbTa}&yX%#aj^VDK4~YgcN3GaqTHD;Gk?2XsqrL9*X9UfA5|NTn zHCT;m2f}}vpvJV^`N%@RKOoD*+G)_^>8gPrS+Qm}_s~#cUn<E+ZBKaoc${C-=fOe> z-dYjAy~Dbtd|o_^MvCLY6Mr__4{SB%cK8Wtu8YpyOVPr2C$%VXAL~6lTRCl*4f!S5 z^y-hRgx@Wkd|0O9Q9XVA3^;2610E#`#;7hr%pjQFs3<EdtEl)wi}LmL_4aO7V*p(? 
zI5l+|U?d)u&<k7JVl@URY%uw81`YsIO;|BVdzk)$Sw)_O<Ov;tt?gfct#A$pQW%hN z57=#sM?qW7_gDv+8`K&=sX#I205so7d+9>p^$~ac{Urs%I_1Xt0E#|r+E|J1n<atB zrEn5fXD6r7s3=Q0nOHhq4UPMWJsIF<gTfLYPlT8Na1R1IoZA=!B_u4n_5JfVYFqec zP+1gspVW?F;NVbky}+ny3MwtNmy_#+s@oMo0cXg7NhTy{85(|+%ef0mW7XInU?4!q z=K&$-o3;P?6{-7If-RA|<qmLIATW?ANPu=gea#T_X|$W<g0WVDBKX{F+=m9%T|r{# zS`zg>b%arx(q;eGulIz7Q)%CUKb8xpwAX#XNCf-M$g|sh-WO-c(a)Y5P~Rg6$R*G) zdf1y(hurmLZcvtiq@W-d%-4Di3zn?@;i>eICJ70ch}R=;X*Q|I-+ga?|K8+s-|Z^E z{`m|`o?^L&^UB^SkH6HtHRo3X5>6%oEjc~8>v3E|nDrutaGF@xda(AAUKh1+gbC~e z*eFFL7|72UiAJ3VY-sO^*?b!jWI?vuq?El6W*7o+n;LgB?5viJ@s*WMxejb}e>T}e zK)oM@(**XA93Vjr0z$FCBcbd9PojwN7lcK3V0!v<7|%df!bU<aZf$J*#O{dhS$Tto zV;Wq)Ggb2!KpYE+EqFEnN*-;Dg2x4cusAg8L#XG=tS(UEf-6=G_Qa^(aRJ6S2-<u1 z7Ft#aAaxDfe5uIczo)?r`u6RcRH-&N%)nnpaKpvmD~ZY<e1P`G#^5YWGO2*g)+?=r zQ!+5j^oF4+sTwaToY%Svtf_I+(<84sA|DpqgSil})tMOs1Uw<MNg$?RcGR7&g_AWP zHjIsF7YvV5+dv;rfymI&!9ZvfRf86zJL7N7Y<B#_#G;Kum=R3s7G5SGLIHR19!aK| z>b18J^h6BU)PRZymlR6hw*9l=7p3(^?zYn)Ij@fHouF5@xapOZz0&PWO%0>3#}Kp+ zE^D-NtnF;yEP0w-=}Y^^KV#QrsSRc1MyAxX$+Qlm`#SNRMQ}kj0afWtP1i7At(y_k zA1x){%74Go)Tm9{n=Lw5B}e<Ot6oZevAG74e^F6Gc4G=1lGX41QyAG7+~Px*x$kr} zO^M=P^4>K;(R#ETNg<XjxL8wSm`GvJNE)DfSa#CaW5g~r3ec+yLDK|Ps~1_P*Gng7 zaCvoANLUzbnzpvKG&C700byjpqgRx7P-Bvl1z|n`><ygJ_vz@~PAVn!S&V;dP#;G1 zft-W}(4Wk`1~l)-j~|EuNG1y^WdsnQOVCxoP6kEX-25Q|PM0wyC~jRB3r6Hy^QHD( zgn^W*Q52tV9U=ErSNGLwZ!+KuP~(H%zV%*Qt7L=!wqU=Y(F5eN?QJ-L1a;fU?@}vI z1BXV;8^l_WPeCZ~gSrLPBP~50AlC$R4LI2WJ?drztg8#I4zT`=8-Rk<)qnH#86pV! 
zr>?Fp+&K9I3W7*>G_7~R5e~`-LpcI@R8Cfwli~shX*B>}@RyuXB7gZFN(efwFgR6v zdzC0CC}4FUe(=(4k2=(tjC*sfpRs@39L_4uscKLEJHAPoPBE>AO(p8|W9CQldT7-4 zJO!skSZZ-cris}zwlr#rqtoMvV($N9>dOPEY`d=y2^mTyNs5pRWy(y(2q8&C=8##D zB(suI#>_*S%t@wXP6(k4nUav1kSR02b)NT+?>m3IPj5NLeeV0ZuD$o#Ypp$+BOv%O zr49GQ&rGhpKw)<|ISrM}hb*^shG)Nw@sT5QWVQ#Ihvrn1c2i)j;bLA_)z|14pL%8U z_n(7=Ytu)UB*|?i=gKz);-}9vrWKc~rRdaWk@zZo>{dNJx+U;For_nP<Mp!d;O=0h zHl9avtfp%BbDqaVBJ;Y4YWTGBCP>dfctFnlgg>DX*^7h=;nvE+LL9xtyLWYUbjrk< zj5BmPx0nT|!ay+C1fd^IOAv&R6cV1dTH6{ti<6%P)-hNei1eb=c|`3$k;@E9PVfuT z)Z>VtHE_hz4<8IGz5Bj?bwsX%%$5bT7d%xQEs3C$85_%>GDX(22zn_e=jU(VoNztP z#fkdmHNSr03ecw|!b{ldU^tuQ_n=&aT*Ct)X=>bN(*nxjEqauS0HP4MOv>&E-uzty zA;;+<6{nvch!kw3OCW_a6Hpl682J(fGz7vQw2F@tTCld_m1qcrx;X&F(DFz~OB2r& z$Vh6<S65a9>^cw*I-6#;p?U=J=&>}Z6fun28F57n#5JoJu=cN}Hj)vfrKM3Z=Ob~3 zj={=|3<UEZPXj8*c@Fe+cdq?xes|`oZcKC}pF#N&*fHJA_J1FS(<}WdbM|gqT-~4g zl`rdgYEvRF9i`&Tuh=4s%~;h-JSn$kw-l$380qjZ5ek^vzBetdq^!fTq_N>kqM*~x z!0PbEj?KU)_FWksQHe8sKGv><>U?QCNvrPLxJ)xAc1?E|c~yTw)#QF&EHZ{6O9yHC zza?s6Q9&S>JSh~kEW3S;B;Cxvudt*x#q)=^PnRwZJAbfU>WZK|gvKGlab`KmPizi+ zJJ7U{TRHWW5CJYxhaUp7gLnv)Vt(0BGZy*`>l+wgGkIXY;DCm1aus*XZqh|W>eWs0 zM2IMnBj9jH2oPeaMv?_mOTC2<1w|nAX8_9&GBBXwO+<~W1?<s1N2Z69-|)SC+*!78 zwG^P>1$lW$B6P8CDz~>hj)uB{@j9yA+1UwnQ9xMu58@9lbT_lOmcD*5kv{xN*=zDd z{%%tf#lU}E6YQGR+nJum?SI`p{lD#C@=cXDhIPcFp;(nCL*gyMI*q>^3V7+w#COy0 z><~gX?8QTv3uu@3zI&AF*T`;b%Spdeh`V8sj*NP^<F%HGDI=O)5ByZr(=WqJZLsvT zAPv29i?xbOvT3e7A3Mnon(F&(iLAerQaU0Oj>%V_>u&tJ5dXl0f-C4tef_(_!t9=_ zD=s=0QU`sEWYhN!pZ7O=PP7}?W`1E)@-I`kNx4*LVmt9w*wd!qcy=a{jEmdfYDJj# z3HuH`*V(!)l;6xx&{5qQvtXtujMvE``8A?fGO9jmVrN3O`tSyGPWcPTd6VZago%D% z|DPw=>?*&M!UlVK`&Pf)H>UZ9=eJgBg+xW)mM00#`=JRPCpeNE@7t(rS1`W5ncQZ5 z%hS<_<}Tf%vrR__>eF<={QPq)#{}Ba+<54N4GlL6n?JA50%@AGdKCPb;pG>efEj9& z_Y3Uo-0A5NR}(5LCz{g+Mn`o|+Z<VajfM_*P)kR4LazAs0m8R+%RqI}BYA>ch7OVJ z!zl24_ZMet3zhJajR>CI>sePXjP(z`d$h5STh;bRb}+Q5*yI6$fv-5DFJyx(#mDzn z=8poz!7<PYyJP8H`*%ZLrUvbdzMnsdS4d%*tAF;qTVoo3o&e1aW)1n3!%!^}_VVX% 
zCS18OHB50oFfg#RbUpKO*5vayIX$Sdz&i!3v3P_M7g>Ou7r&8~S^S+<fXOgFc(_DD z1{xc0^Y``M5fNh8S5)0DZlBn!;94VFYqgh|!JA*GFD$fhDerB*Ge|`@u$B?5y7V8J zBp~XZ0iynFV^l7y&2trv$fHM&l%oPeOW&(b`vQd6Z%it;Pz)ix${s3Q6)~9q+iX!x zZrA^*s_iT#p}_dyhtC2ZZ?ICf*eq4P=lZvOdigQ$feYrRS5G<{^1B^P(>%CGRYL<w zunN*Rq7FWvfoZ-U>@Bd(g*Z8h4KbhvM45U@bxsZ74D1(EGqb&njN=0XAI)=HroJM; zJq!-^n)@Aq`+|oAV(8*N_^kEh+N7|(`1qwsu72#hYa{oAz2aX-pusw-aj7H1IiMi& zxInc&^YbN%?}6l6>B35uLi&ZDx_Sz(o|iIYq3(^0iaK@W9a6rkuC5}SAWbv8^_7Uy z|A0KA7Fk<eMW^*&`KjSWN?cliS_*Nqqh^$suR2&sy&AaxaV@y}6CGJGSy@@$?BTqH zs&ST1=Q9teIrMpQ?(!dSmN4WCO`VvZ&x_&T&&K|Yp;E&v{IXMjnL*z1?v7Uh2H9Kd z)Luf2zP@#~_Q#yv1ol%emT_Lcdv}cpvQ&BpE8A_K!zx71fdKJ#&t^a&(VfF2=gDFh z8c^Zk-@xaiWOj4D4Q1E)riJeM`s>THeI$vS%P}y*z?!C<cU3)q{_&G1o@jR6y(_77 zF(f1e$%<dLfr!HodI9xUtp)Zy^9Yv!j@Lry<tOQhZyEvTBk1A7J9v|tT9EQ98*slH zqfcQqPfbj${*nm?T`eg$cMgGc=>U)aIQ5T#UGf#`?(Xgg=6m<<Mec>Z6ME>V;?T92 zo1gzZcgJTX3>h-S-fU>tg(xb8tr6gKFx+_YKf2?Qqeq`bMshrDz{3SOvA4ixXljag z-)8eCD{!aLn@0GCmcq2}>|@p?D&j@D^PW0IBtATR2o@!lh=8Er+`oUsZu!#E=DACx zoo|hh9r*w!xNs1DB-i0UQa_fKmlvVx2hI4&{|J1Gi;MOMAIZtd#l@?YA50Al3;;bK zqM#<+ZhfT`_KozvUlpd7g7-WDK=Jr7@5z(!J-{<jr?UT^-s)3!2XBEw1uhZm7Zvc9 zy-aHNSeIf9`7Wrd6W<aXe2R+;MiW>ptH3NbHa8vY?PYYip(z24FacV;x)N4@&k)p& z@{<Qauue>5MlK8r=)dW!q6w&Ke&8^G?*!Nhp~i-2B!jx;y|p@q+wZgZn7#fwQ7_&q zi&o=g2QTN&(c$3%@tN^)MmoBdJj?UI4*y)@`z`@pMv^8PMim$I$oxGSh%QUx6iOF? 
zje#r))wRBr6{ki!EF;jqNAt=avCkuaNH!chKj6WGt+lBfS1ktr?=D353RfNwR(W_P z!3UU_n1EmkTnH54fEgG6{vEh6c`M2`=q30rvY;MgOLtlpQ)<h;fc2NBGm`2iQbGJy z+rknAM63`L<Np>H0X?k+*&<uRf`{P+<qqQFcnU2*dbu(RqNto4C*VOK5`nH;@#-G2 z^ELegC0&-IqvKl0PWj25(9RFP!Uwx=ZMXw00#{F+^R$@Qb?xUxMQ)otLy>~WnxWi9 z;$;XMOBF9art^D;Wi#HqdGhGdE+hje@;g-kL&r<F5b1r3-E?bRz^tByha-J^R4Jp8 ztqV@C1}ohn@=ixmMY=Nzg^FaWb?E-&FDQb7d-G;4k-)}e14&Mk1TuT%9RP<~ozg4Q zba-%#0|4(t1EEs<6ep(@I@&^U>Myj&cJ2f*P!rs7{V+=HWh9SbdtRchq-Wf}|B{v# zD?Jr11|U{15Oh)d?!V<vu|P#o>-+d@hlSCVpItvV`YqW6eBF(exk?#_fscN`{lRH_ zlbPv!`?iexyu}&yY0CM%FFtFNtzME$f}I9Ykb|5VH6!zGpzscmg8)*`D=gg1KL-WV z>bxbWABu|AB~Sh-a&U4Q7dszkWBXk`1mZ2en|KI{7+(4ywCwkv(EP4LVG|FHM%IGq z-q)JM6KWfVE2wVp!&q3@Y1GvMSi1lt$an4p!+&E?5v3To%V*hIYabsBP%uMx6U1jC z2M<R?$IZV!c90NZ$8T@GJqQ>Y;M~=Brd7}7@BL4pQ9t|%FeQ?6qLc}3h41IeD7D>) zemw+V@T2PgdleK2M{rC}Pop1-yc`G0CFZ*AwOlMBV)q|)6cKX+uXfZFM1sm*yNUK5 z(2gIN;Tvsi$~xf4Yl;;#D!D}z%OWDg-6QT1=#qdF0JWS~Js6~jBMk8MR8(AC!)qdq zI9u1w!C`uKmbmPZ?`^&kX+tw25NQSI2*i#EqoBXU6=UPAbr~gWAD$nGC?kzA+kZ{a zp94D+#AqZgdHMNZFoZBm<e@{2!2TY4){$L_ZWHb`t0iS#>!Om9l0riB$WftY#D~Lt z=CFjb1ximc(yGc*&Ie{qL9bAPj(6p-w5Mk3W=&m&VTt<!C<((jzXP||pn8Xv2;>Wg zzLKRb6Q3jCp`q@8GJ_GwW^^>?zDJXnZO?^WAi?)*)YH<U+rNKdapDr^PJ=5~R@T;t z9eb$ZU!9~|0BHh}8yFkla-jbCQCq8&)}Gt*2hFCSms{_%3wQs-ZN=FPdV|kx|G>a= zckX<EN)t^3RXAob*P$)xg{^>wMSLni;vXoDBxw%GPcp?@`+3FzfklqS&c+5I0ydP5 zt?ieSbVEZ!*UZi5Z#G6Wj-YC`(2Y-}pk=x>^qH1$1vo2G3R%aS+mZh%MM3@;k&V}2 zIdA~yD!^8I8=E+&X|Wv8ki2k^!+#uB74<Qo?y&Lkg{b-+IB=lp&kDMjh1RVvL(UaK z=c$&WO?X#Sw4Fbf?rH^+jg5^B&hAzxMoR5(Sni&`(1F2%1~A8T@SvWywz;Y4z~EqO z=4D=4S=sM>I|(@^m1u_yfCKm@JA3H}wf}cJM5APuV#8<HzzCRl+m$QzV++U;W9e58 z<_}eB5|89x<?HD4Bk}Y9?q<zJJMXTQp##ka-XI7SV=*$FZyg<lxPjn*A+WE7ynQC= zz45`2kBAWx9fH0QGLNt=9aN3LL4<d;wzgs|9jE-%(7?^g`rNWH8q^cKs{;HE{KF=y z=FQDb;stt=V>eA61YoVL9osVpZXn|W2}wzEGUCg#o>WjJnEO&H_yah(CZ*`eR;8bR z)_!fQa^XTKk_~JWBpa#`s7U`oKr8AHUs@^^Sl#M&D>)g^4IEld3JdGt46eUUJm$-h zFFg8Rj$&(@sr)MU{ri6l3)BcE2n%y})<R}ZztYXZNtK_Mu9*h7w-0ya!R{lOvmfY& 
z^ZjfCRg%7Z`xY7$WX|v~yupH~==xQ*ilhe7qGcv;TM0)q`uT74^9iQNVQ@Giz|_n7 z+XmL3&;vdPu@*{878VnD*r6dXH3fKo7)uQB-t2e<o!iV$V%!EOjK&+7%n&gQ4iCSc zeNH#*Li`L__NNUp7D*TW<7_QpYvF9t(b0jw$I8k|Q@ltcz@F~z54(SF{6KwKY$yW> zA76}=(=X{r>b=9sumD@7_&3vcK4Lhz`P$RQ5$N@h-QfuX)kdjLB$G(H=T%l#rleRN z30y1WK6PrRFCsK$fP$ObGKenV%%BA5?d~3z5+*(gEXi~LVf+RK<46$-_hjm1;!3~^ z!X*LBgpdI3Qe<b?Dpc82x-qfnDLNN>qFy1OBTt9UY&m%VYx4fc-QktzE`G<&Mz(Ps zHV42wpp&rjW3<Y;0?y6oXeU-SqLQSz__$Pejt;PxW(R)$9Q<;~T@^#;@POgZJnRI% zL+}x)e~8vZb|leHD?UPD$MpC>OYXJ6=bhO}iA!bIc90Tkm6QmkrmW9oowk0u!!knn z(oXrULy^0uxR0n#NRfRMl$MwnLBq(HlSAkn&Pn}~|NbSFLUMQKr(6CIHxt)nyK*Dj z<gB^y(qPm}E*`Wq73(y?6JE9*&8SPCT;Q$67P1!iqUDRZ>ZEs9n<pxH<~QSmO>gJZ z@zxhB%RDc|N9r4Mw_7i59)D~m-tv9=bm_BzUpilS;I8HJGetF@`}`hC+NR+Sp^q|N z%)$h6X;CwqPfjzoW_FiNIIS!W>ei-2)1xtK5VT9%M1ZE<_Iu8LlRIng>30P<yBB)P zwBbGWq+$#%(nmBBBswR^pZf3u$^hF76shpWk+qPNql3=HIM)Aw1cpO%=}4m#mIMfY zICrQ5ez!;Yo5ex}4W0+icxy|`r5RSWz@@XskN!AYKTr#lxRPKH7vyfZsrS={3C<{- z9XpHTGIAmyZK}`=x}bK)Y^d8}Iea12ib3*}zL=Q~Z-=|mWr>|5w{<^N9Z7U&_L7mH ziu%_!BqotU)l1IV8LiAFL(y)+x8V~j$Euap0WaptDz1)_m(KcDj)y{N`)}Q_wdQQQ zE=(Y30Xaj$gH}8dwM5pA3~%YZjLtIXLhx9_YI)mgjZ9_~bglk7(R@aCL0$z(3_b#d z_|fFItPvdT?d>QH_wC)AduQ%9Rw)Vb%iVPx$)K1pcI^DcN2`CbC46bbvWiemX`1iX zOdd}jz#K$P9kKs6;7+5ThJ`;QgZ9-4FpzYp8Clz>s&gi^eY?cb@=YOCyLfb9iDSo( zn}<z2_9myW`(>Wszr5*d_K{8Ch-zIqpNdNJ;l92OBymTF8ovEkZy@zTEI3X_xyYz` z#tkYgq#H!(6DsGmAfloc=P@2lvhd!D6Krhe|0|T%|Mfaul>A;9p;lXw^+>+rtK)4> zHId87CVQDrblaP{$fb~GKX@|~MCo_UaQBvg<asD*BtKHuO_dyd&7R?<x9>^t6UmjH z3=w+Oj($D0N-)L#{_TnzF}vyH&G@~kDJ&;OK!@itvL+-jf<9ZG<z#=|LHu3?$PF1x zN(}8;lq+ROkkJW~seIdi3Relf7Y#MF;)K?A$WOJ=d8O%uULGCS)_*JdFnVhI{O#v? z>j|s2&eFGIA~GS3kG(FfY8<?f9jy@csq*;n{!sJ0LY~zR#xr+!nPTOkWPoln!dPva z*`fO#(IN;blB*$~(nZ`D@La7BfhIp#f5zViC>*m@?8@J0n|w)i`7vj(4%6lytvA~s zHCbCv*zDgdx&NlORar73NaNJf^1Bx{-{sciBW*L4E=~D6Up6Z~s+_|`9|UbApMKu! 
zna5k6I1h<#5%@-~hBNqkl?aI2t{#cGN@O=fA0juQ_04!pCpLkRa^)mdVVM_Wc8I;T znH<S8Jp-edkx_<5YZ@n0&dy1#<6HN%=uiF5Z_=pA$^SA{u{b_%4b>wc^%DNIh1#Jx zlxiTRT&)`c1|6G_Fm?0#B^a{#`;$iQUV2lYZDY$J7Ztn*6dkic1(~9B$0_F815_`3 z8_kBsYu9ej)21qa+ShO*QSRS$gwd9}+1teQ2T1mjgPE<?u>0G6@OthQIFP3(o@tMu zPf#MPJ=i=UKtoB|H9lF8tMiKeQ3*@=!s@l$$+al+`jPn(mdLrCfWDG%ZLPC>64(qi zuM(h1slH1X`Q(!la(cho{q1<;XC-TYY1buQ+Aw8N?6+AW{eNC!a1?_r5tV6WsUIM? zvOOu)-}Y$i2-*KY`F`#$GkXQbL;Dqu&8=H9+Awu9D{$4|KlZ<uUDO3>*1so_G4sjO z)tor>@>#V2%?Zj;cPrz)eqp8|F2!f3$C6Y1O!q46B_WW>-}u0?Sk@~1u!bz=oIsxi z2p~0NkCj>$OOIcN-5B9XGbcXOsie(%fSPjm*sP`^$O&qJ$t(=ikn(CW%j9}hzYpf^ zaxkeJnX9hjiJC7r3ne{v^bkYI%_S;Db~e$qo(EBlyM7mK1e(F$Hvh`(FM=>Fnfyaq z?{$Bxit7?Teg2N9w0|7gr_j3XBeKK7(n2kZ!({r7x?dO_<;~|dGAv`%lrk(n3|khB z@t04EzHLqKf1UEQQ1|EPjk&}ovBP?A3C~k5NBnv6rgAga-;D8bfKlZg5fO=i#_G6! zv1P|TUVdK7TzDwqzfv9>Z*;Rxo#f);Icbr*HR*3C<2^oh=Ea^_>Xu37dcBED<+FgK zba=9h4`b<shtK(i5^3PbjJ$mkQzxKA1d$93!3-TJ4_|9%=w@w-@AZ-SAid$xnNZxW zJo~fY_t&MzUHr*$r(JdGlk8@_9^WkPo*IsU_x9k80gcJ5se>0T(hEeX&{VIGdT!3w z>U~XSpxQ++&7Ygki>UVSX-)4C54pzpd34}<XtJ7j+{YoW@r3)|>XOf$Gm-J|tE=AS z|J>Skb;IU@qT-5w&fL`HlbkQvUsg=_m%gf87&v<5XrG5E>*qG(6Td!qKB%s87V2+G z5-rKgORV+QqaJq76`SD3?;MzW)Dq~X%OAgsakI#SpOS24u3FZq_uc(()np3NgGx;w zj`bx!ctEN2LF<112hY3HZ9Qyz?Y6(tLXXQpIVWp(y6z_Pjz4Q7?HM^w#S?QD$EfF{ z)>ry3X=mI5jU45ZD_BKP6u7%f5dJ{nfSiZrP5?Z&wg;uP&_M<tY4i)w0OYjzUjQ`V zQEbQh$d#XKm)jv+@V<Oid`s2yXZ7`5w@(p2Tqk#7^aGhvdDa*SA+Y+uBU)=Fal^MN z_F4-!b|`Ml1Ybp412Z2EpCAxU3xAHJs!i>n*5OIX-Sv6!cPeLh?v7HnY99yF`+~tv zq$Fk*>WoZ#4f*6RejM3LdaP<_zw&1?6FGu!z%hpHjh?DxvBQVVq>m1W&ow@>J5st7 zuJ`mLgPFAowj9iYzn{hAf$C#4!CPr+YqfNgk6<i!;n11qxk-Ib+|M&7{-mOICh@Hv zc~nErb9{Pq(0%?3`~FKV*$*t9QCuSllT3nnv+fQr^4<Ne<0q9!Lj&hl@5|w{EA95y z3aE2&JbAK<{hEJsl^_i{&7oB`&b-%WzYbMR+l`U1&N_%b*r8O_-V*lp%-Y<}n%^D0 z-m&=w2FgJ-;xXspcEmp9=8%b9d=_WK=MfTlI^+!;yTDGgKFi2dZfAbr(Js-z=dcK) zvlk^Hh-l=&W_6{Wnxx($4}2RiRl(-dqK6?u^`EESh@X2ZAF6PyZOYUVK3e62(?N6* zj*o{J`V-n>fV}|q6gu{;m7lI$$v91~8dPJhsQC2LhHbF&2|7w8wtvl8Vze~D{y&QN 
zn)s!B=6}@B^-mNS@Wq@PKrg-<htvSQl8gIUGw+eL5tlRNSLqpsbsJSGe8u0bw+vCZ z-IDBUE520bxw(0@NY-OR?xNYR;KsCkA??qa@zZ@jN4hBXcJU<&i;A!aIhW<T44tTa zz#Hl8LY=jcwtM-IKvScbY3G}r7cVGMFSwitiSK@YZF#1g;%CL}7aD<lm=L3K{L4Mr z*=COf5zi+2WQL=06Yj7DfA82Ou`)38`Tf6Y<qOYMR?Mv)VM52ncQ-peMav%vW&Wv1 z+H$E*mq+ded7)bEptHN@$y3MKxwqWEc`M$LJ}FzYQq!93$`qw}E-wBmrBe6Pb9<=* z%C3ff70c;E@9a{2mJe;Q$gSa;)MpwuXx2rByGq6Qb}79K@7Hj-Vawc|`dCv;_P9h< z_lL_mXVbC94I?o;Bj!^0JdasZO&t7ozFxkr=hl_#m=;35USzvG+~rXlu-dN?=U{Go zf2cdSu}gua&*i2|m#wh!=TuJ_i`V66+;8#Azx(X7{mOF3)6-Qa&XmQAEm=FSb!Uu{ z_}%{ZkN8Q2_Sfv3ayniX-VW!lIj0#2co>8}w9x<)5_|zW3e}-?`s3BPJ;>CF)#6N_ z2ij@$N`PNlTUy?gl-vbY1C9oK8?L-F^}|}xOaoPjAQv%=MgY2ZI6!d@T(`37L|LF5 z%p@Tt<@BQ<11$`cfdpTCW(aD<p4Jt#UjaQrECDdc#l?ks&mQnOoPk&n&`~jk6ECnZ ztiU7zGtz|Stq(pPNoEf$>4(dYbQU>Sx*t*v_j8auD(*fSV|195w0CLd$fEEW%c04g z;ES@W9(h3Wz)$&=)#qkYMozj<ZSIW@M-pS+_6S<ti19BgdaD6mcTB_U*P439x-WHk zCG0~K_+X-OIpPZx=Pw=}bY5<)aiSo5Z<CzJcyR6Szs)qk+|~;|J>{(}^=G;E0kIkQ zokU5R7Da_N#7TbMD!~#pHvLM^kXH*DQ=doMD-K7_^W=sXP7hz)bd+^B*4MAFn6WtR z!mG&M{zfbZ4ci~CwkLA76go*$Y?$hDKZ+i?z`QE)QMR$ZyTEn*Ykx?Rx#L$eV@f~5 zGg<0w_o+$mdx6lo`ba&$C&$I`n3nC+(cMfkoxT(?-;VO~o?6BCk`m)P<9_EHhiJ`> zlhdyA9TdxM4!??Rmi&D`zPC(Cfu&>FE&fkc#V_tXwKLyvad}LBTxsWyz~F}FuQdXc zUU%xcyM@!eD59ut+Z)PEmfFaiir>hy8wv?-beDedI`2tpdT?*&o2!LOW7Bq53+xa4 zn<{_lFMU`l@XeFoW?sE<vGID1nc5L1&x*Z=2EJV-r#*bl;o9lzll7MqmgDvu&bR6C z->l1#EEf0lN;*(~aEe_l8Zhej#ztNNfp`C-(tdCrp>pv=Yagg_Q!v&61SA05;KVm| zs{|NtviQwU9UV(pU?3ChAb@~B_z+Eb5SVYk5e7gmcp{fBQORS-&e-TEsjp_56sjMT za&bQKFd=}skq*xpV8`g*psfK~_S5L-RNX9qmPv7OJwW8ZR7?WUGqg46`h<f!KGkEH zrNq_gZ}gd}OICbZJl2H<me<BqcBG5&u(My|J4kUSMmlarjxaFC`sd>JhH4Mr)UKS^ zWziF-c^7DvK7DBD-*n!{j0@-}nWl(&Ycx67@uw*zpTuu^jfUBKaKmQmv#6Bln`a_E zwL!GKO-<P}w84Ue7u}dqBH=YNf8H&bJ8B-oD~x(Yzi!wBWGA&}@OC{2VKTgE%tJ@n zeCqJe;+y<3qLY*NVm&TY2S3r6`SroZ-aJ#A=S+EgU|rqH8U<(NllCe902T;uZ4Z=N zcZ{C$@K0D>+k4WpIilyWcjA2#x~3bPG&*X<k1)L881shv^=nrH!>(U+Q)Ka6n7$ML z(gyC_c8+Gd2&5J7-W1&Ft=KQiQlWKGci-X7N$kC%Sy`{`!SZGg|18VPaPJ>cSnz#b 
z(Di89$;8pxq21?i{mV}qdD=^xy?P#t>z_Z;>2Y&2R0)X+Ui);3hu*FFWkqs1*Kvts zyyxzYuK0OX={f!^cHQ{2Nlu%+Q=xUss8>UynMGMCx%AMp#J|5aY`rh-af&P8nVi^9 z;Ns>{c@ipex(G8YQgo*l+?B&Zr`XfLEItg3eHb=bhv0~)#l<Xa-2$IZ5;$1`Ej49K z)y{|)bMnTy1AL8W5s38!{T4rLu&{*Ss&q^8h=4~R6&djI7cX8QYk>?xSU><Emb(Tw zTlmB3K^<LPQo`ffANcv)Yjsdp0RF_kE`U5m;`{jVd28!&{Jr#<GZ4U`0JMZ-1T4jN z2M&#or^jrbl4YRTk)my8)?*6S%`<DR6Ti|*-CX!T3c%sKmHwY2ootM$pIXZB4R<n3 zkg&+_vEOZ7_|CSKhW#3tdnXQ#an2a?KRdVYWLTqowyrthz=)-@gtc&_`Ytkgm6XfQ zOhHtQJYUWyZ`}3y;r0rGbe7T^U2=qCiMX$CHO_^w9M8(MV5o^`lf#&nbY5Oz+XL1j zOhKvc(p!(m_MRgBAzpIDOdMk(VxkySd3aYmt2QhT`;lv>Rad-uk+$7`td2?W_R46V z6klJ--HqRy7!Wi(tYQ;yl`kPq?i`R<J=jK=;8yKhr?JYH&%TE!OOV@<-oe8cdVz%g zz^LnFXOW$rPlnE?kvZezpA)W%j!Rh+cKwx{3qanKn~|%$L*|r0`JJslKOf!mqVVed z-hP;XrEQ=S)_Z+`GK%V!WVUA32sN47ah=Pm$d0~OX}=EBbGh^LPm}t!{rQ+=6m`Dp zd($TnqEfUuPt+_i2}D+tK-sUzJ`@xrbnal$pC^mwmYtUVh`la*!?7{vwQMEB$yp}( z>l(GZOh%||<mZncO(BYK8Eq<dU$8<y5h0J30G3tVkl;0qm{V8Dp{jgRYhL^LGpr)R zZJ0p(ymk9FIdQL;n!+UrMs)w<tR(?t{u(RzDwR?+L4)qT;g+M~JrK~`;CV68mU6Yo z5ekBIkYR0p)Ym^;?Si*sOmuVw)cgp7sGpI20ac))q1hPsxntC3E5i7Io8CE&KGPD~ z*qg<ybU|OvKMp{dCt&oRDj+e=0Io~~-@1E{Hl(FhNZy!VTfHg<fTyMST;+*lZjn?} z)xNA(7(;Bn+j5GSF}{;9EYwDhJm}2B+XxPE`PTTvgQfG=Ua=oN6LOh5>Y0qUG@n3W zoNXNajsfQn1~BCBAdm|aR3n5n^bJJ5cqF7;^m`ql<{mZh<F&qxYPhAMo!w!NC?-O6 zCP&m=FG*RU@K|x5$@(t=)P)8g1;TibojKl9-(B-DOwZ_)hn`uQb;S2U#@JYWY4014 zU;0)fIQK9WQpGwP;Z~zyjcN2WD0odmnHZ+mNJc7ebMvCPr*m6Yk%!xx)A{d)o#kHn zr5-ibkDEAN8&JNs=fEQgD=UWHPX3!tqf>uuUtQcm2;<mAPzzwFyUEWYyAwP1ZG;L< z@RP?>3%`@=VwvRh^=~S<bBCBAhj!97nIp7aim+_URPGE;_}~$Erea%EYG_^mak~kU zDUWi8x*1`Dfz!d&2+yWl=~`UGX4UC)Y{*k!mBz|SAi(8Uqa7_E&`2}l=I)fC<pTB+ zI%uE_CqZrtbt9-1O|Akor$t0gojjQ+={gBQ5?JXlPfU!C{%jb>N(lyMW~Ob>FPuZ@ z38Dc*^d%yEyqTy93ZIaq<d2>n{gVvPibA(@2gw+RDp*mfDcW8&mF0Y9jA0zyInf=U z2*uZuZtFVineO}H)*^O(R6Se;rkuk<+%3W+tWVXf<41D`z7(6{NACS7jL=cg7b%$i zd77X91TXAYKrJ2V5aJ9KN=c8T3O3M)Ftu_NKuUldB361bfiI~<gR!-=;Prjey=no8 zM0RTA%k)1@w?xSel9Z3-d=_{DM$`~SIYepqTyHTgRPHNPG@bp|3~#@Z<;gd?8Ie>H 
zs<sl$0-}N%7c{nBW(E?hMx!Zwezg@{Yw13wSn~c~f4SunYM+zM;s-`jgf)f;q@Y_} zVWL%EGy9sKR(ad5@4a5#%$O$go6HC;0;wON?k1^RT1Tc1)xM8SpE6&kzY7mG3ia-% zndrEyGRudLA7!Au!F<aAmIUFCGUA?|zl221UR2oLKH=$7>h;QF!nE|X_M(sZF^@uX z;6I<Mc7iS~ljZC3_wREjdX^Jf?@8{Nk-aheQPbTiKP!_1l)N&RA@k6;<;r^|f0ln^ zJuI~!Zig7Nb0UV*PrjKW;!EP6C(m~$btF$wjC&FH@$TJLj$>nEF#mfon)WY{$SXw` zR%W;fWHxAl!i$rlqu=AiNxe+dS{mAOt$Jt<Oe3^bCTC_eFCyx60)Pg}ta4EY$@LXj ztkkyI7TOjv5zxqjjE1Oy00g(TzAgsz8uDG-7y><<M=qiQ2MI3;85hDM82g|Aq9GIK zkF^N(tfmzVB!6~wb$$BepJ75p=8xtn1~vc(xpb)=Y$NoR@$RMZ*6NQRSz$N=1<_ic z@l>T&mi0ls<+ugCzjr(%JeL;?jWvRkB?GGmQ&Y>BB%S?L?4NhEC3!459pcE=ZaGp4 zHUn*#PCI*L0?)p~={mxy+EfyqO?9(><9gwK!~;g_f{$TAXHRy9fkwM<wstG>)6s<h zp5xiAZKp^1Z#p}yjc;jCH!7bV%dzd_^Q|6KO}UJTIfL)U_zc=>`m)qCrApnKj6cNJ z4e=h(seTabc2wNX!raQDN%ro{*Kn`iRZ@5$xsb5OfAG*)zq#vy8P(1+x4v_@7wTlJ zIZqy%x6QQ6F&6Xoy1Dr~<M&JduYGI(jtGA$%usYc(3f&<TQ@t4L#X8Hth{&j54Egx zly_hIXk^_jax~L==NE3uB%A0fT}Uct6j0}RRj2XH#=>0p)GK(&Y<#c|PO+>myy|vq zyshzb=j2)nF9uHDAXSO@edFQkd9fSC(%B)YHgC>{dh?{bRFVtRmcPUiFtn0-rjAM2 z{>jivVt^f!$or@-)+XDX?cRp!5sNQ7t}-Vk)SP_~oFILkLzM0@t#o<Po@XUDC)EE& z#!-J(Pvpyf+0=AObat2HOus?az12ohE_x&@`%~trZMcolZi1-!+!Mwst9-z!r6~CU z+!jGV&0+)}v9_9;VW$hPY}Q~`!A5BgsDOZ<2!Psf5q%>h?At<jhlf^IJf1@@C=C@- zt-0%XOBEV_5@35+TUn74&<UMxc>zO~FJHexN;2J<eV70%6?m^<DP03Q0{#@`Snl95 z#Vvq|yO)J!_Rk;mr?-FP^Fv1lHj&-!3w%60Fx-%JoupsjtUwPURCY`J+GiS>WLZd{ z!6qQU3<}0eo@mm+f<f~3pOu^2lRncceO;fGKO6Ju+D-?CWLYp64b90LYjl)(HU$|f z>@{U<5!Rxwf6alZmEt;jPd$O8Xq{qT_4H4b&*qGC=W6<av`8a@KI9tP-nUx5n&*TL z?xoxnPoCmmL%2bDkW$d9N+g?nc;4+`Ort8zj__EmEct9U31<JgK-)vA?Y_ieUs;hD z+9iFoYG?<6i~jk=r|KI`Hl8GY80NX)cgOIz?v>2IV6OVCR@uAnw;#V-DeH0%I+|^y z6Tjzf3Z`dP?}XD^OS*gtn+=n#@!s7rrnzz%tR%kttRI@<=*LyU^qxu|mEVyb`C3Jt zI53L?mnsMbI0o5#_chYrDgR5ex=#IBzV)-$5f-WBZNl|gsvmuu*=~^DGAK!X`)zDL z6D`68!+y2n*20*c^>>x^X}EX@wGv~<4vifx&$WDY&r>OpCD01f&QH3{VxsI74mJU1 znH!7(F^Xj>ry|ujUX_Tg>U~?Q#DIeLU;u471Z%v|f~^`T3s#nvJ>A_KL`&`MjRj=G zD7O|ugU~v4hCnCgr2c$Y8hZP{0MIT4cOH65kjr7MyM=Tdc6@8+YP}>u2?5EUXe%+5 
z*Mg}!#*odz8r4;Q+{50%LDrj=TR`CN@)?(=CzOQo_Ov}N5bEF+V$()6q=vzZg~c%H zMv~|F5RpB9pe$)tw9SWWj@L4b8Hw(!vg6-v3&X=(e-^vgCA05)Q~}qB?fJBh{fy+2 zNA$C@7yINRI8rWeyPQp7BO%nu8@VWId@B9Qi*d-QaZVOAJK|F<^ZK_AR6ie0CiuQg z|8cz2FmbiZwx@Kei|n)EyHP4z5&hFtxwkhM$$3V$?=M0Y(6D^_V)kaxc)@GU^_d)9 zZq=xHAI)pVoXivf8%28O((ZLN@L#{M+S$^Qp8gJ1i}%9xm4J;t#n_D+Lp-pcPqxm- z_S3j1>P|&PG{90WC}Q)y_xI8)!I93N0S5fJ3UdKXfo344=<>x}V~kEpf{pzC!-rwy z4+F_qpHtl2`o+$c@H9h6!p&X=zYxu6wAF}CHb^=_OF%aVA!%`LZgB1n3L}^>7=i{_ zVACF(da}@<9Ap|8rWV<D*n-gxf;Dg;bRn~K57N^=Jo^$hgnY!o7R~fAPoL(S*9JvK zMgn{MnkcaV)CbfDO-)Ud^+e$@d<h5xpq`+u_tNGr(Vc`o549SUXhGG3C(&WT2|;*= z8V&Lf!HJ&6#wk30XvW^W`Gdy(eol2pQTtfqXOn9ezGYnpkpCIfg{rC-AZr34fCc%c zTURkyOfW06o%t<pyk1I>uOQfpSdI-yPXw+)+7vJB@N4zzAOpjw`ftxYr}w4Jux~IU ztw)-*y1sq_gxJwhI7?t@avncU0<4?JAtuVp;AjCN5)_blfs^YEMjFyQ6rblcHDTy; z6$~u`_!wYEv<e^j&%^dL{!Ai=j4tHpnIEvK0M8EVc^7fm6_IoZ4gg9`j1Y>b9)#=H z0EkG0zbF+z>H-V`Fk~k+13kDB2M)j-V?B_?8{X~)<#%17;Kiv;+$QhL6pw}?@Jka` zCeo5)JB<z>PJkviOD7ZCGaJ_kJ9U8fc)|p_0|!LlW`hl3Cmc`YjLbH?on8FtBMP}W zY7p2he1M{KeSLds3ibeXxTT|xQ2B1TT7f6}=zA-v-;Krlxnv~5v}!xKqPCw099mU` z_xJL`LjL>r-dh{RnrSZR4q#*wn6M<K2rrBeYHMl+#Jyq0eTn#LM++L*4Jpyp2LB8U zrVrv&5?|dcT+R%V6vD?(pV-938LQu6-@i^xjgO0iF}1JFG3s5rCa_&yU6&x~0S&1K zx)iilkBHxZqrBr!IOngg%w3C9zdg}*98?mk+8?OF@PAT(?%?hlz}LxtCpDJ4D@rst zOhC2(xkf*R1YoQKvLQCMm+-iU8X1{8CMCVi%rs|Ez~~q_I^io`IDg(_Z82l(*`GDd zZxdQ48M@vWE|`3O7DY*b3!!=!yT7NPz@N<r8W;)19>D>o6w$o<`gJjw=pZ|T%~Be; z&EU1*^}jA~%}v7>S^_RfQEDkMF?#6a<`|WJ9$O&!2nyDJP85D}AYfM+cJ}s4ad2SS zpCEdqRXn_}@Ir+HqzXPhq7fiOOAQ}TPZs9qgWv-1GVFx^<aOxx(qW#fCK@sLcsxXC z76oZ|`S_w6ERaN+tAW{KraNHHvk%7s>_cC2Wy0taJ!PzX>{6?>P+=z8(`wa#NWdlA zb7ArNZzrbnU=6wBQI7rtVGvyd@M{XpYpJZ5_Fsd12Hzp8_^Or`oEqp39a=qB3zwIG zIFaol_=($nfn6Yu^0#qrej7WjD$C<bMLcGYEon{bQPj;<_I)K?4aHSJKU@Y@+QtUF zG|;TY;0#+!OSr6oj|U?Y+*N{Hinc081iE#&O;!r|85!^#4$w@4NggOApz0twp17%# z)d8jE9EeMci=H5?E5B;BErh%YB6JXC_dTj<Z_h`Xj6VbH0qG&Xs0g<XM-&P*U_XTC zT3onYpkKe%=02_Stl8>x*D*>qX=!GHg^<3o@&izSAa{g>AAM@QH&@K?P;G4ujEwrQ 
zG>LkBn|3Rl7lVU?Y~d=eTF2+yw?1&}Q+0I}A%N{JxIm9goI9v(;uek$*y-}bWwT=r z(DdpZpD8C9C#-wSIO8LLpY<ofsRB-XJW;l|NKnHpH7ViBUT7P2<)qaN@P&K&Y3Tc# zvE(g981nhRUN4aBE)hQ9)k7R6k%O%jsEx6m?!){o!gOT2g$Kkn+kN+wxm@kPd8|@# zhddo4;lZFVZPR34vbV=*j?wMgq7Tdf)-yahi^>6Hbd<P+fyqhCPO9CtpA!_+AKl$J zAI@_`AWmQ!iPd5<sJ3{)=RPj(kkUmcD%rxVQJ}&$3TZ#q4(`XF{z}=9Aq?#T_qw66 z5jz>N0KWsr8h#o69$;A^=<q5WdGFs>IFEe6X^$}g*PNXCp*Afx^q&490&7bv^4CNq zyuu4DSe3X!vW%FI!xd6x_#z;u#fXSD)X(7PLb8ZT2j&GJRE~pe=-_~p#u1ww10jy5 zR09A6E9+tE1V~dji!&f!heIcJ4#;_MfrWSzMG_p;2!A)1`!B=_3|8lChy|yxh1WMU zfT#QhQ3;k<n{bcg4gy)1cF&%oq9O`kIKc|+rzIf*I5?J}-_X5za}Ftx%<I!E)$ihU z*1BSUje-vg67K!`WQ6oXn*ke#UOc=r&vk3;^GIdl2P<={D;IS|EEjzrwU4>Zb|~^~ zzx+($J=4uxR`$ko@`L<B%p0ek)B{SN$|jzk%x2mu@T@(UpXYw&J3vSr0)o?V#KTL! z(+9^Y))vT!xw*N>&~Q>@yn1!b#wH^;n6|yU8x&rgigT-;y8~S?O$K8UAZ3Lx4U~63 zIRWbyR=yZ&0+C#T;U2}s?xv=uR#vHng-ZxdaT8`ZOoT*4EQ5%y0BR}ob93-*rQNk_ z4pHFUJ7ndeLPECv6=z(CGZdIaY`^1~z!~FpN($2dSFo#WYO+TWu!Ee2jw%3GZCSyX zrjgJ|gsWP#P-S8W+1%;|Y#@a+N=+UhVq)G9O*N7G21AV8E1ocPm69q&s6ZwW%leSm zzPfe}smW5P)xIqa$(oyGP>$hqf#}02f!~D#$sz(O4!wv53mjk)x5nuR@UsVNC?7|$ z`m{Hw+JwJIi4ck4kO9@|b)dR162R&O4f!MwPdru@G|WUxMgVu#xVuQ1jDX@LG=Kf} z4a25%L6H3)M+sIeK$>xmQTnT>s!l-hh;SFHq!9<5_y}B`9Q!N4hAxK<foMWGsGs2b z!uh+Yw$>8&o79h$o7)DP4f%1u=Rz}T9xggcs1Gqp@gHni&|A|;mchUWI2pLYCR|tp z%WxTqHDVKY{>x^>aGagkPHR1zx??v6wz7FK2_9n&`tFh|pYdAyGm$gX?0I*2wklDM zj_fh*>yWssnAY_{^IUF^xYE6u?J@bh9<q>oy43WwG$VCa?3wH2vsR}m6N0ZXS{I^j zC-b`|t~{x-J22hr$T0ukomcF?w~s$;8LvN)R8sV%IlQmT`F(Eo=9}c`A#wSO93eyY zqROq$JHpUq-Y<CsPN>W~nzz5*d$VaPbPl9A)6H*tVc8Wy$)Re_s7#0J$m>2kJ>_`@ z4j*xGy9fg#BbQx%Vq-Qm>>^+n69s;w&xt0e0=*6BR|`GXAeF;OOM>AVHAHL3sf5^A zfqrMgM~*%$5Ft0aP-ctF{lp_|?H8k5susbzC-1n+E1()p1vU_9MsOfE#s@=gKum!# z2XB-)HRJs#8;qg12UIhbZ8_%!kG#~fb3A(5tG%kq?~uO!>AB4vn;NBuLWbbOaofd3 z3Posh73vKXG0Z@$%aL)zvgmdD1l$Yq@t`>m3V|evk`<c_ZfkiEj=?PrWN8{k9R4^f zJpd`fh0xX(4jvuIgZ@o*&0?#7O2(l|)GcP~Zo_N`WM?$1P7|<^1U*b@>gwzYe?TCG z$y*f&erfID96i|TV5B}_mYB!ofcjr6V2^$W){aB#%UZCt5U%ij4T>Ct%|O^G?|P$u 
zNmrNXWr=*z<IWvWOo_zmP!`#c&`=iaT)<BFj$0VjhJjcJ_C(JgC@z6{%iR9M*v`0l zs7k;v#UF$A2=!l7Mo7yvgD8A|za&kH?wNSRswy<45v}lIU~X5EE^kI8<^4Nm^5N<{ z*YbA})HygaGhg?_^)Qm3WRR!3uXR5rD$=UF;nPi6?9$Y&ke0DlvYy}#to_u@Y?z{* z8L1W?3vN$lW+WY%ZNjF9_-^hT_yn+@*;$7lQ#3yHRQGa&g)=h+UyP~CyTm_5md7nU zW&F&vE*e*j3<_nu`H&u4eaxEhA`C7K`c8R~Y6oK09RrO){f>K<p_3VV;J|%01-5XD ziiYnKw+xTIXrHjq&QMWTIeu9wp!RIGjMZ>{SadW4J^go#6$0NE=Z#=HPngfNQ$=Ph z!`QOu5HZ<4%hu4`OdO<%o7-c58&-wtoE}a`0wE-W!JRBdrQ%>we+o5n7Y1_7_LUYv zZ2az>K)8yIkr9R?5aS>g3<T=nvV$}j!9gU?tS#ArB#$S$<0CRwqN#c4Q6Pr5LA3X0 z<<v}sl))H+uYLRW!SMxDzhv*V2FNLY{D{w}ZiY(+I;&u&BQiIQKE$E`8W^9SpD)MS zjXVX`>mXK3fD4^kQjSCw_04-+fsBkdv?wt2#iYm|Kh}__!#}2{#5D_~k@|0&CzR=~ zTuyoU`h(U<(K}Dq)TrqWZKcaAP6f?_SbbvOkYIT0X6ee3<LsH|k4+5K)n|JDaKBo2 zg0$tT-SUfa12`o!9U7{l*yBcP-&T5W*{#|`H!>!#``xWyKaV;{uiyF`_tzo$lfP#! zqe<zli=m>=c9iUFdL~4^L&Ku`l~VqRY?L}P@<%V#(V{uR#l9s%Q?0TOFn$2rO=HLR zx)k4@$%w-8dCvW~(yGF-$mmE7X2Eyo60E$W*2q~T<>+eOs8$zcoAl*edOYJG{fR|3 zPxu5|HT$}&?XS}E<>!tht>NLRRy7UmRee`Wm#4pb-UyAppjh41{D^mq-<8^4GW}|! 
z)4E&z*ZTAbou*_#$MrSZGrgiyy$xc#?7?-4hvr;9{dru3Gyr)L%uxh9j<Ih4==Z$9 zO&^3*16gM<#>)|pG9>N`NYe1|Vuuj2rH|(PQxil~g^ol0X@+(NJyn1q-!dKmMEPb9 zoCUU#s3IA>R+59l!65r98tFVd=08qDGv6d^2@tIxx$XIwZM)f~p;zg7XS|&*&rWzX zR)l*6a@;3Vpi#abm{&vT&(-))?F#vk$dA7J<oj9ojUHoXImV-AN=74jaxc@dlU4WY z<1bT&g%#%ii(k}*#H2v>kSEJvv&XFyPuI3CXIx18w)uTJ^U~Nam#<FF$Q%%PKmlVp zcrcJv#mLAAhR|@+PWpY4o@&+8TTd@WhsP+_JYYczX`P}QWsQJ3$G)ZIW%Tz^*X$s~ zL+yX{D&ExF+Y7%RNIM}Ofs4_qXGM>_ri#k*bvleITJdx9f3YW!dZsU<?oY^s1plwh z9LF-J5|=E;LAxxPk_zwq;_^bfhR5yIZ05V85(^DZ$#!+ZvGvP5wa4<G?$sU{<~m-V zlTum5f5*g3jQ)-O@V^k`XqfR%-=TD?WV%#w{k><|*vHnkgf1Q@<5KN;qlO}i<>~Ez z-%B<^E2PS19Tlkwf34@XG#%DUmSx)B7cws1r9WczxQUNkey>;Zw62ls*Pj7v6~Cgb z6qR~<65Sskp!ycQzRXIhTmH|yy}UOtH0j~tB*WX9)r0<>(W%=%=4aY2+i_~f&xbnw zD=Jtl7n+TWU3r}onk<s=<W%!AUsC1TOmFeyn2MXff9`ytGCy7Ar4gL8>fYEj>a$o| z^~$xQ%ST-Q)6m>EqUNMDr+@VuX}HSIJIkVWAJoi4wdlxHRaEM~eoZ}Se`qUIy5<NI z%>zuvhQEfPp)_{=y(+onZ(qM|Ew*34OvVZSN;qYp;)Q9?khnJz3+T9zDb?U0APzr* z8;BHCPm+>sY;3@FFt5tm-57o9LW8920+@OzD&B*LQ!{OS;NBj1HDR!V?pwnin5|s8 z^z=VjDLs{vgTpDKOjs(Vw_1ZZgxsbFy|ey)jCM?jk8iTdS4(->#~4^WF+EK*`*xp4 z5&>laEb4X#vO_I{7X~o)<XLj4-gS51MMj!?1-g>Nr%x}T7Paf*^OO6_)sU;mu$v;% zM4C6M@w&`*-uqYCE3+Q%o>fm0S9N>k++yAjj~A&$-R+7|Qsr)-N=b_e&1tsUN$}Kq zH#RbsJO?L)I2xs7FCm|~JX$+l^1uh{E99(P2J?MY8>ghkz9sFIm27Qn>}u>S8Xg~P zXt#PM>ykOXaAB+9_9X@L*oO4bC#oyGW9wsO1$WL39?MV<F;fqW?7!EzvuHBQ!=uEi z@&lQCm=80bZE^p~PipGp+7p|fr8>lBJG%_|rEhpnzP=TpHa&HalFL*yVYIA(V>UGB zzS7xC^sd7L@+R9KjZC}+Il1*S<`<W!H%4boz9w#J|5_7^*gsTW=7qKIE$+@EEX`E? 
zXV&SC$K+z7J2gY>6Y>lE$w>i$qo?k8*FC$ZI>dQ}JT}gpu~^!3w#8I{jDjKtuc#=E zIH7<}wm2Ct$@?w(bdzIaM35Ia6~=v_%5}ecx9!M3%=$qlalJVnl{tLB79mQ7-xp*p zSUkuwt}$|kt0Z2n08WQ!3o;>p+22q3-((ImF=3<I3dLPH8C`4sLaG8D!kA}rmv`yW ziDxa=!l)ooG6RN$Sh;m-2|mkdTvQ4w9zsx9qDVx+9u*x8U1}RuHBm~Kd+i)#en{f~ zr`Kb>zxd1?8e}c476|-7!U%Lt6Qev9axZTojs4}&(@U3BA~vp8joutO&CXV?#HP9w z%&T0^$I7Mihvdph)@YSuzFy^h#=k0_a$cBvPg2@9^R72oS<ymWnjps!O(}g}G^=Cg z9rtBpX#!v3=8N)drRkAZQ&SmSyaANjcLv5kSC(BX>=q2H?ycMN(w9R{Ta7I!Z>ztz zW;>y3&^PG(*lTBZZ8e1(Dl1<d5{9WpVuw{KKMuqv=y{YB?92Gc&&o(4_g3o|rI3Ix zJDtq(CsJNQw4Zkbn@5S1zkl_8!;e-w9;qu2-0!i_;p#X_5dPV&7oEIkH?7#=C9Eo^ zuC7t1)s;rY_4{mdAAR*xwbU%B?j7aeIIfc7oF+%LqndK~SX|1jFfszh>fQnta{2Os z@_IZReTYTG&EUbT1o{=y4G6eiDk>Kmcp(=BV1pc>ZR91w5?ou?XDd&rIs>jX<a5Kn zejO*)$Dj)<d_bT>b5KN76tk5$jvv2B?}{qzhb|<YpkYm@b;GwCVE+p(Y<g5mHRf}u z39%L+%OqlJJ`Ql;YPN#IWC^Td0OBBV!ercpW?(V6POE&_J-;X@pjTkTaAhQ@z%Sv1 z3zwBbgX|95Kqm?cdZk6L&XUQgOV7N%p0A>!p|UA+nR@glLz;o{0F%3}%b(SQ+kYqO zt_JN%HqQ^AJS@9usCVE%!pFgR*e?4Mjwbi_v@#uf`Yz|j_4yAqEc}sOKg(=eraq@y z*&CLY<sl<!xcNwjGpeEb)$gkbv$Cs;KI2Pgaw-a{6uE*}p4#W@tCEIsSmx4nE$@mU zImV%?<vw9}L*>KI9UQ6Z;_hX4><u2o?lAwoZ+k7q&Ddz`eS`Iiy`H?4u5!<6JE>n_ zLwXvQ+Q6|IlCUrIvQZ{YP4Vw@Zj6OHeG4P=UN9K><|>w)IR8h=36729g}kyoZ$!h- zG95@5xY1pA-Qv;l-+RQIDz9X^pLW`>X(w{e$F}S1M*$)Chf*+0zjc&1>dTuqC&!Y% z)|i~Y?%I=n8@XVsL*U08CVH-*-MZ<FWyk8Q@{?R;0f^l?a-2X=h>ea`W`Xuts#XvX znAfwqp^@R?mJ<{9HrKsR88Po7Px;mMy{n5iN{wEr|L#;a5Wf-?7{9e&eNtH{vf=x2 z@ZI_<MQu&UF_o~@?*%hoLuztucH9mValP9*mi#*9!k1?Y^DHJwSz4KwPN>`3IWi?k zw#u@`vQkp&=gvJ6&2n9xEOIjBPF`qge=&G*h|{7|$EbPxM(+c-*Yg|4mJW1WJ6G|U zxjVtGe|p9~S=Nq9>7v=*)`a&n);5lpJ=%JP)zjxx2E+$EDEEJUQuvhB(o?x5{asp` zWx^j;2@xZ<r!K{1Wx*`o1@3*<OW&Ue*2up?PIK4jtPb;iI(E4$XPau<E<H`t*q-~8 zIP&<Jp8MSt&DdMZw>Ry0`&-@@diuP%)A=Wcd3QpGd7S7_hNaVnd-Y_BXu{=;dBfbh z<v)7QO^;NY=f{mF`H@KPF0X#EN1hI)*-3u>b9DFLE<Ox$E@Vu2>LWk<w{=1+LHa%$ znbHlKI_BGZz;{rg^lh*R`!ak|)VjQ%=Pw;khmcWDc-`35Qw)1vP%2m#^-z+=aMy_P zzHo3_TKs33-(&T(m0#|YB?$?^<fwG<NO@6cQu)lWVL3{JS+@bQQ5ME}s~z{g`VYo} 
zzTzpHZT7-#Wn7MI_u_JX5A*Jr9b7F=qK|5{BRDS73m&4ytX+;#+g=ZWz-rn0uIGnM zb9-(lMAePG;=;JM$Ch$DF)t`_llM~9Ql%6Wbq!2#M^Tbcee8?&<KF*rnf?e%5_2f4 zT<_b{2~VE=s?a`EXYrAoeC^EH-*pG~@AfKJd_ZBkxceLfR}d<iWwf3t{edfPNiEsk zA;cLz7{cCxjd$IUX+d4UL&IZr0{$F(x8-~y$Zks=k5-}dkNonQQ8|pG_8=vJOqaUK z{Du50{~OG5IG|th%WMXH8_VyhUn`21J?#>r^I0%Kh0-syat|4ynkz7Acy5*S?7*9K zt2+RDAJv)%k&0`wQsM)}HA&_6lF74NC8ta=*m>-ozUrPA6!Kw)8TUV`2ED#w|6ZN1 zTEPEonD0kI1lewi6z%gO$4`_>70PV=O1AIk3ID?Wirpssda=S=oyMEYu^MFlFBi8< z)=32fXm%;TlmgbL15>Y&nTyi5$4?x|zCX}J^^z+>Gd6{*jl1FT)9pp~lFO1_a&C6# zwC=xr$a~=6x1`mh@+~KxC9b8pCOGI)!KTf3NUx)u!d_6CZLxR9sRgNSS&7hG6GiQ{ zr<IFqhi(<B&d2oJr&A!ziuLv7yYTw$f2B}eG|TgQUc7)*;KAO3u3Hm$x4@Gd%IQ)T zEBy-lIL@|YOUa7GHPI;)D?F6&t*#oMrj_6d%DeJVwyeVW>xLdBbBszYCDlm=t5inD zqohN-`<nmul%%Kx5$OAz`)rSK2RYB~@2u5xTv4Yd5KI}l>F`X%HN$te1)tv8cWYwY zpV#fm(O+`n`}V6@GrEmiY0>ZF*!7^nLFz1n0yk9{eXX-Y{$Ylou<9`h`HSD_dLW9{ z`$kV2(|DxQLHK7tpJSScm|=s~hg_~8^LW*gL&dRaV)}7K-RCG2<me)6&Gl4`;&-G( z{yn8DrTIaE^Qf^L<)T%V;hzWOf$((0xa7A*Qa_2<nNy0yQ8BKdk}mXS8tNTNlPP^{ z^pYF=!ka#QW&fnmdUTWEH?;N4T<-_%-+g9`opy|-f?xK8lHYF`WG%fL98MzCb(IC( zP#~ok(>kTG$Al{g)8<(WO=g%t<XT%>`#HY%s(}G!CgLuszY9HEuQ)m5x=`F?D80{K zH2sZ3@R8#~c}A1bcbIk6!~WPB9Zu*myCb|}_<Gbh=3ywg`*}krBfc0I#@_@S^H%JP zCpQnzZ|7o7dHERV(ytd9KuY+>yi^kg4T%`)Ded1!bN_E#!Sk%b<{!UEj>hbg^4>$~ zN5N8V+~4Cg^G$8{jZMNcbt-}QW-&+CT`OGso3>@7`7)=Ed@$&h_ELCP`uQEYXK1qk zwf<m7sD6yZ3gRAEe)lZ#UPge&KnmFP-^ewJhlU54Izo7O!dc<h>2KPj4IRCBL_8yN zB3TKV5lsa&<bJ-j3S5M@-`{tL7L?2Ie!V())TZd)v#oEM-)kSNA3wLMKzSpPBQB1K z?7nXR8yD$J;e=26hcl<oSldU&9IkUgMT_<r2KPu@o;b914o$*nw1FM_B*knganBnq z@}QX3<GBdP3&3@Atl%}EpHHNid35?FB#eJ?#00(aKNF=*Qdh|9r-;0n>dJKN`#M7p zy+-ASs~65?T9S%|q~|*gOr0*e8FXO(?~&|%bH(=KZ_WLW?IiOh-Th=-@QSzNpSDBl z`JCi+0Us)Rl)yknkFvvd30*_@#a`w)kbhye#zN?_p5DO&2jC-EvHZ)UKq0&Iok2GI z&|oomCD&Z(KlInr^F7!Y4h}@pISx6%DoM%7dz7udwH<5pJy5FC=bhGU?IMsbL`g{} z_lWFG>x;}tYqtkXnp{C^6JuX9rOK^%tIh`A^9%c(8-Bw_^=?^?ub9sF`@8&CCbM$T zM@BIgw^3E4MPF-`e_BWg<evWvgaCX06$*vBhXVir-+c4UfddEhdi~>$5eb3_3JO}f 
zbg5G!f^mmK5sM9TFx|79K*;B?xSXLFF54Zr)j<H}Ci)c)8yz}c@V6L^Mx)V4l1w7^ zFL-!(Jk}uWU&dfC7>&lq)JuV)C<H;2N+m5{=;XfvKLq_FRP%HD00000NkvXXu0mjf Dg+;w& literal 0 HcmV?d00001 diff --git a/docs/assets/design/hybrid_kv_cache_manager/sw_attn.png b/docs/assets/design/hybrid_kv_cache_manager/sw_attn.png new file mode 100644 index 0000000000000000000000000000000000000000..10aa6146dc7ab478dc60c9383ab9fdb1c5de9d30 GIT binary patch literal 4560 zcmbtYcT`i)mq&hnMG*vnAX24w5D2~fq>4fSBho=4p*KOg^biOLh*U9vAfVI;NC_oK zQy>(n2{l9lp%+D@3H#i$f9=^ncF(@^&bu@3y_val=g#MT=Az6XhK%%_^fWXyj7E3$ z9?;Oxx&UR3%e25#w}%X;p}C%8q<80G2zqNi)QXjGOt;&e{c$JemU`^lJCey}*Du;K z7h7D^FNtkv4-`+f5qmsZkr@49vZ6UzXYx@kVajI1rf6!4y)*Xm<vIk9^VR#bLNRmN zU%s$a<)56uueBdX^70lA8d20o2ip_^Aj5sAUf4xo@aPUf?I}s1Ky&IF{pt!JU(yP* z0^-&`_e22kZ~9DQKt!{Ny#@r0xS9L~Ks<Qs{8#gmfA|0G<8I9V)G(=dU&q#Vc6+`x zE;jaAVBpglJy%z1IrM#3SJ!Q7suP%^BSSgcS@~P~KA4i2__49^Y@8J)&cvtHgM|S7 ze%5{z64HD)!e-d9I@93EDq{Q@YXbs-a5&syfbi&EVI&^RlcvyAij$Cb(V~3sj|kKn z!>mwm2Jy&7Y+HHXb8>PLoGFCLr=+AvOG)`vP4I<iEaXz|6$kF8a2N-mv2lN%_w{C< ztUoVt92!yBRC73<tDVUNgTV%$x_9{i75-QA{(+W#eG1}D%PA~0eLrq*XGd9nN|?#Z z<K#?n@$z~hZ7#;K_>c{Sh<(c5Xrp$C=%1k1mG^@buhSW#ZkSgm(P)PPs=pYF!GRnh z-&<M>E?+tkPWD)zsaUrODaP9QI5`z1B_-9@*T=@vB6}Jem0-IqU{+RE{oZrf{Zs7? 
za-;H8=4hscls6ABbep4&qp53z1u7{I*^4MA>IkcFrxq6%qfn?4cu0tfwszqNRb5@Z zva%AIiQjU&$-Sx!Zp`%ABscyHYk4u~B}T7*c`pn*m&*R+s`}cOnXgQT5%V@JmzD;- z#RotPyKmMd3{Gz-Ph=kWtLKU=1#&Ej4e)Ss4&d=`v?7jI-&a&t#_JSSS4)xPrx_Jx zWz&XS-sI&41qFGyyW4j}&iD1*MQ-fDOiWB192{zEYtz%yLs;<iJDVi#vJySZ@d<Z| zhRxvu@~DW<sn5F{V|?Nkb63a=N?7;uq$fRtt#9Bs{DtrBoab_C%EHF`2_&mTQMthR zIb@!EKWwq}O3aSP<G!K6!54s2)z?>6SHtCBEBnlS-c3(^@nUdzn0M3)3XLPHyJB<d z>w*lqUQhd}+W^LY+KL;6FRIY|SzCXdQCKq$f5A$BNimRPRGj)@HavA|Y)syBs;a@0 zP-EMa>PGg*YH4XzS68>Rv@|p{Oixb(RrwK~Rq9rfBY3p)iMw7!39$CW);;gC@?S}Z zaq^}@3@8L0OHSnncRk?(0h_RvnV>JSUey>dxWT7N{E2F)zHlVuU1gt=r4+mCHp-`> zlIXw6J}SV*usPqnZ$8Q+Sbv345gw3JWxszWTE{5twa})jpio~_<fIqw>iRw*ApwKg zL9vQDa~Q5Wn+vhTBM{GbJL%%%<Fm4|kVqt;e>(YIo0Wq@L2GNcy**E351Cw~Yjp0v zw|_3x3SRD+{~&EWZP+xS(UQi0e3*I}DmmpkT{>I8U3<fjccD>7jJxOM{l(vbd`9*u z(%1Nf*ciYBO^2qfWX);eo-{|Zi^#cGuU=(ITDrjDAT~BX6>iRygMQIWuMxqL_wNa( zMQv?!Gc#e+b#RV#IN+#L+pA524<10yj;({rSxk;L`MGzR8fuS2Y<gnhz>0{?WZ1(z zdet)9Xmw@ywY5p}^ZDK6yUJgBdq>yy2L=WJJju<?{r1gFt59}!E*n|lZH_}H>8yP@ z$ZnZnB1uH;CVSPF1U4w|Br=A)42HJs;dQxyEjTiqBh1sDA0*MyQnh#zaN_(1zq6tB zvibBNYkVxlvToc{lJ9v~qski5l<OceYHxq|2VF4y%`Az;2LffG(Wi5+mNj<MGcyvi zdU|!vD=RDhD(KZ!Hw1HvL2_DJN5~$`)HH37;PR>7b)==WRo1$$XJCL{m%C`mD6Z^U zx2)I$hyyP_MI!)L)))!5)|$k^YKaF8H!t7`l44>42<|yzgl7KSflEbzFl5taX;Usu zW<&7XjSm=^>&d-PySAIg0pICHET&()VBq5`)K6yTM@&pkD#**z($e0h$a63xYjdjy zd4+@=?vlGX{MBu^5M7<(HTBnZz;KJ~3`?lDDsrx?)p8OXxFVf$=W;(@@}iWZW7zrE zsV^Q3qtubY;D7+zX1}78q58_Vv@wf=uNBPA&B2_E{A^rYvjjr$){NrKl}BvnEob@e z>`ItvNN7;`M_h}L(u8@AlSZ$_f$Jy4)R*<Ng@SvQj{*u*9b588HeT~IClRXv06uZ| zeV7C4yWLbs;A-!|RkrtgZmW-NF_ayB&$3OFk@n0^Mwa>LnxGg0uIos#ZyapR>N#=p zD=7TjTl(syWZQ^Jbncgukx`P9OE(ts_4SoyqrZRuJ{+g?&f3;iJ7o8ku<$>omaH+I zU{cWOwekJ6iwD&ylkJ-M$R#Pq)26d?3WXB8Kyr59h-PMCI3OJ!f2fVmFDMe9#3BkY zh1Hc|&Cb6bX*@*$7|3C;qHQ-wtmhYc;`-3!XeY!!>SRmV)N~{^HWq<El#~bs2L8eZ zP`{%@GBYzb=UbIEHSP8P`FdeM<*Mza@zVipyFk&~>em@LCqvFRd>OER+IT@0j96`? 
zmRP07Ed-~B-=<nsRTY5rV`F0_C#cGbio@q;l>Q8qzFNM|lPA!J4~4qz7^T}y=zQAv zqM{9=l$h`6TRi!*?JL;iT>c>y9c}Vxb)aIjoLUSWDfe8x+Z2_1T1v3RqrP@`>nf&g zE4+|$&aHg=b~kb^#&pbgmGg8>?flu!EvAmro~6Q?f;H-I6aU>zCm9_vVd2uZZ=-u* zDQ=jWn$(mO&4^=gJ|xFmK)_lah#i5?3jQcFr?s}UI0MF3Rac*#p8oS^Lrjp3f!oC1 z8%_m>&b%S&bR20&KgacAv`<gk(+<ct%SIO-1cybP(s89pj%LdB7)?CVa`UXI#)vn( z`93v0mCJGFvbNJCM^RE#oNo&qCy_czOP7Bv<F8%2)^Vys2Gx(Uu(I0Q+n1}*N6H8b zhdINhWR>f3I?7L+KDHMW^>_VNdfcbT<4c;dlPph<55@=}jzilyiu2KE5eQnPxc1$< zfAI@NaEW_+drxl<0Ku+m6Kw!qR~Yho-`sy24))d*!S`?UVz7P!rIqideQTvowA=<N z1cyCKH6@N!fytz2T3p!a!9V{t{z~}1LE0NGPEPu(SCf;IGq^eW*~`h={{H@By3WfK zN}}RBSp1nepg=fFES5)_>$iPYx5`~&_iu04w4)~65b-A0$F)T!m!&Q76jmlCK8ORE ziw%Na{7zZn*<1&yjCTHuzvo)&45(vuj%<Q5^e-Rg;pUc-mX?-~7#kmFP410}iIJ0& z3*H%M#w=s}RM1y%$k?@oTsP&``Or`s=IRywW&~FFdD4Dss4$8>Kq%0nr!3<kRjs+! zd(%FH@X@tSm+m+e)Ek^-F^P&ykh`<lny4)-jp?dk;NPFzE7=*@I=Q_pH5@wm_4TZe zDjNFW<;l^4l|`WU4$VM#9TW-$=-3k&Oziki$v|{!DkM0#$vHhlDq`X2froGTeTJyG zk&WqRi<gGwA}{l6>OxuZDE!p+m}?&CSSfD3c@A%F0sluJX0fY3^QtIRPr%7xo1ks4 zsT(Mx<JLtXpu&v&(494y?57D|1Kd44>ZZLkH8l}0(P%qAzuorm-#sunq#MF4D5$ZZ zpa3I_tNo^$1MyhrX!}X#1;|S{)ml<Z>)G#>P=<)4A3x~#3LVflILBw_i%LB|Mvjzr zUcez60o;v>CwdyCb+2x3e~AkB7z9u<&X&}=_ba{Gp0wCzgJd3?m{R6&RrOqG)w&=F z*V=UZR8dh;ZQEpHZ7mF3A|lyYSs-?H5fPDr-4BTqWZvZ73Jiw&^9cCua*<&gZ$BQN zhFjf}cQ41xHGUZx7?y+mG-|2iEHHDB{k?y77+NH^G+3~369P?$AxmTzvkc`aR<|9l zW%ZMbiVTJY?UA)5lCf%zJf?LnCU4Z{)2t}Jf+{_~6QgRaq-1g=t+~0mC%%1qsde;r zBA66G-PuW_w&o&m1u{TO3gbFgt^fjH8w@GsJQjr6`B5`_HPOR0nv;sU=ZKfsA+~$E zj-O{{faze{+Rz-VLM$i=NlC+CWV`XjxIcw8Odg&!^ZsH9V>@xFerJb|>h}(6>S<B> zjNX4Dd6xwZ=;-KN2Xi~geu++Fz}BLtr^jNkSFT*C<7Cubml_`*huYen^rZ`4=C8GC zi#m_g*6zqT{AMh~ws{s(TUy=SLXoYVypMR9lxJXcXZyEnA#bup*0LoVsnjPiACW*B zV+_aKjdXjq9KFn!U{GqFpUhlQDWMhhywdD;H1qm)*sFjw3V>t`Sbl!~99c&Z<VQvZ z1|awi0?}GmM(3^&3&2I1TU(b<g2vBwKZxNZkIpu#?~6`?KMhrtw=bqKuWxJ{JK!Tv zca;%<9alF7x2}y(bJ~PGq@IB|Q<Quc+T!S#t}!tQBIFem0Jo<C!SK;WT;~lst-mzw zuak$NnAbf#k0k`%{h||?zyA&-;r!{lYrMuvj8q@E45Ry9>I3}!f#~22y0fm519{J< 
z83q;-q8$I)*xK3xPAS3LbLo?ur~jHiKRPLENdj)T!w7Dd`A26XmVZNRC@aU(s(p`G zfOrhiFG|Ik@3r%_Erfj?3YRM_dLrpHl)5$g6Q;{E^u@GB>SzeoH4$&MB4_o<&D_VQ z2B7;*O~Fr}%H=~yxCnKm9m4F%6T!lfug{Z1o0Q`KI^FD^MWwj=1~n=w5iIn4hI{-} z+?-vOTE#TB1kc4~76L7pKp^+MR$P(Ves9nwk-WUT4DcNfkFcLBLqOyjZMq)mz8i*B z&5=()>ZM<pdS#HELHHVud#9!r@@r}$Fkne1UrNNdTB`lMuN!Bh3*BT9$IEu`r^lVK z0FOhXxz+Nu!VkQprH!F^Mrf@F&RMU}B~evhVmMsyU5wnaqx_xACjPlts$GBgDR<H1 zTOpH=Sn+br$2_e+Jq4l0bR`Ue$)859@~)89x3+V+>ay3(<HrD`w8S{ro0)xt_0se4 z@!jN2z~$m_kr2qRC#D@KntDrAwnO0TEkIr07r!*HgsX;r$I_%S)7#c}M;yGm1bO{C z78C@Xe+ro}>>C((RY2(P>MH*9X@4L~QiInQFlKi4LvI2<x&Iy0JzA2<+hfPaK7B1@ z@_YN>q<i-^f*m>cXcZZ~CHyss_jz13)Ex<9a(9OhOI6Mv$u`Z4`hoDH8O7H#16Br) zDjm9ddh}igN1fj27$y=`z~G0CdlYPQ^JDy&!tGIo+eaq^b6tDrjalcwk@$Kaegs>V z+bf*$;3nWeTjCums<C-_c{w>ZR?K3{LHG#|hT|=%EpF*8?ma3?naLZaVtosXK(fw& zaxL}RGd{UrlAsdV&tqAWB?$!cq}(LE7X3TsSGu1U8jED~J8S(Gmk28-L;fZL{x3)W z|H>I;QjOu?h{O+NNMD$Rz*A#?;ZYzEJp)a4MAR|>d|X$ian6z|_{Sr%z=^OoMopb8 zo0wH!ui=1+n-*x?tPC~!djgNjBoeLm3j0%<4R7V(vvA@pz}sDqv8lgTw$DxL|IA42 o+TA{-13vt}(-!|N-*J8+BiqL-GhLJu=$yt#AEH;I^EmcD05IUwsQ>@~ literal 0 HcmV?d00001 diff --git a/docs/design/hybrid_kv_cache_manager.md b/docs/design/hybrid_kv_cache_manager.md new file mode 100644 index 0000000000000..8f17b473adc08 --- /dev/null +++ b/docs/design/hybrid_kv_cache_manager.md @@ -0,0 +1,245 @@ +# Hybrid KV Cache Manager + +!!! warning + This document was written based on commit [458e74](https://github.com/vllm-project/vllm/commit/458e74eb907f96069e6d8a4f3c9f457001fef2ea). This feature is still in its early stage and things may change. + +## What is a hybrid model? + +Many recent "hybrid" LLMs combine multiple attention types within one model. For example: + +1. Sliding window attention (sw) + full attention (full): gpt-oss, Gemma 2/3, Ministral, cohere, etc. +2. Mamba + full: Bamba, Jamba, Minimax, etc. +3. Local chunked attention + full: Llama4 + +To serve these models efficiently, our [KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] must: + +1. 
Allocate different slots to different layer type, for example: + - Full attention layers: reserve slots for **all** tokens. + - Sliding window layers: reserve slots only for the most recent **`sliding_window_size`** tokens. +2. Support layer-specific prefix-cache rules, for example: + - Full attention: a cache hit prefix requires **all** tokens remain in the KV cache. + - Sliding window: a cache hit prefix only requires the last **`sliding_window_size`** tokens remain in the KV cache. + +## Definitions + +1. **kv hidden size**: The number of bytes to store one token's KV cache for a single layer. +2. **block**: the memory reserved for kv cache are divided into multiple *blocks* with the same *page size* (defined below) +3. **block size**: number of tokens inside a block +4. **page size**: the physical memory size of a block, defined as: + + $$ + \text{num_layers} \times \text{block_size} \times \text{kv_hidden_size} + $$ + + `num_layers` doesn't mean the total number of layers in the model. The exact number depends on the context in this doc. + + !!! note + This is different from `KVCacheSpec.page_size_bytes` in the code, which is defined as: + + $$ + \text{block_size} \times \text{kv_hidden_size} + $$ + +## Allocation + +### High level idea + +We use a single memory pool for all layer types. The memory pool is split into multiple blocks with the same page size. [KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] allocates different numbers of blocks to different layers according to its attention type. + +The core challenge is ensuring every layer type uses the same **page size**. For full-attention-only models, the page size is straightforward, defined as: + +$$ +\text{page_size} = \text{block_size} \times \text{num_hidden_layers} \times \text{kv_hidden_size} +$$ + +However, in hybrid models, `num_hidden_layers` varies by attention type, which would normally produce mismatched page sizes. The cases below show how we unify them. 
+ +### Case 1: toy model + +Let's start with a toy example: a model has 1 full attention layer and 3 sliding window attention layers. All layers have the same `kv_hidden_size`. + +We let each block to hold `block_size` tokens for one layer, so: + +$$ +\text{page_size} = \text{kv_hidden_size} \times \text{block_size} +$$ + +[KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] allocates a different number of blocks to each layer. + +This case is only a toy example. For real models, please refer to the following cases. + +### Case 2: same `kv_hidden_size` and a regular pattern + +When the model has more layers, e.g., 20 sliding window attention layers and 10 full attention layers with the same `kv_hidden_size`. Calling the allocator once per layer (30 calls) is OK but becomes inefficient. As a solution, we group the allocation of layers that need the same number of blocks to reduce the number of calls. + +The grouping is feasible because there is usually a beautiful ratio between the number of different types of layers. For example: + +- Gemma-2: 1 sw : 1 full +- Llama 4: 3 local : 1 full + +Our example can be regarded as 2 sw : 1 full. We can allocate blocks as if there are 2 sw and 1 full in the model, and repeat the result by 10 times to generate the `block_ids` for the 30 layers. The page size becomes: + +$$ +10 \times \text{kv_hidden_size} \times \text{block_size} +$$ + +Assume `block_size` 16, sliding window size 32, request length 112, then for the above example model, we need to allocate 11 blocks (0-6 for full, 7-8 for sw group 1, 9-10 for sw group 2). + +![Allocation Result](../assets/design/hybrid_kv_cache_manager/basic_grouping_example.png) + +Here, "/" denotes no block needed (sliding‑window layers don't need slots for early tokens). + +See the formal definition below. The layers are divided into multiple *KV Cache Groups* so that there is: + +1. 
**Identical attention type inside each group**: Each group only contains layers with the same attention type and thus need the same number of blocks for a given request. This enables layers in the same group to share the same block ids without memory waste.
+2. **Identical page size across groups**: Because our memory pool only has one page size.
+
+Our example model is divided into 3 KV cache groups:
+
+- Group 0: 10 full attention layers (full.0 - full.9)
+- Group 1: 10 sliding window attention layers (sw.0 - sw.9)
+- Group 2: 10 sliding window attention layers (sw.10 - sw.19)
+
+Obviously, it satisfies rule 1. For rule 2, all 3 groups have
+
+$$
+10 \times \text{kv_hidden_size} \times \text{block_size}
+$$
+
+as their page size.
+
+### Case 3: same `kv_hidden_size` and no regular pattern
+
+Unfortunately, not all models have such a beautiful ratio, and the approach in Case 2 will produce too many small groups. For example, Gemma-3-27b has 52 sliding window attention layers and 10 full attention layers. With the constraints in case 2, it would be 26 sliding window groups and 5 full attention groups, each containing 2 layers. The allocation is still inefficient. To reduce the number of kv cache groups, we group layers using the smallest layer count among all attention types. For example, min(52, 10)=10 layers per group in Gemma-3-27b. Then the grouping result is:
+
+- Group 0: 10 full attention layers (full.0 - full.9)
+- Group 1: 10 sliding window attention layers (sw.0 - sw.9)
+- Group 2: 10 sliding window attention layers (sw.10 - sw.19)
+- ...
+- Group 6: 10 sliding window attention layers (sw.40 - sw.49)
+- Group 7: 2 sliding window attention layers (sw.50 - sw.51) and 8 padding layers
+
+We will update this algorithm if this heuristic leads to a bad result when a new model comes out (e.g., 20 full + 30 sw, the group size should be 10 instead of 20).
+ +This case happens in Gemma-3 series models, and models in case 2 but with eagle speculative decoding which introduce one full attention layer. The solution has some memory waste and is not perfect. Please report any cases where padding overhead becomes unacceptable so we can refine the algorithm. + +### Case 4: different `kv_hidden_size` (mainly hybrid mamba models) + +Some architectures (e.g., Bamba, Jamba, Minimax) interleave standard attention layers with Mamba layers, where each Mamba layer's state size per token can be much larger than the attention layers' `kv_hidden_size`. Because we only support a single page size across all groups, we must reconcile these differing hidden sizes. + +The current algorithm is: + +1. Increase the `block_size` of attention layers until + $$ + \text{block_size} \times \text{kv_hidden_size}_{\text{att}} \ge \text{state_size}_{\text{mamba}} + $$ +2. Pad the mamba state per layer to + $$ + \text{block_size} \times \text{kv_hidden_size}_{\text{att}} + $$ +3. Apply the grouping strategy in case 3. + +!!! note + This can lead to more than 400 `block_size` for attention layers, which is too large. Another padding strategy is to increase `block_size` until + + $$ + \text{block_size} \times \text{kv_hidden_size}_{\text{att}} \times \text{num_attn_layers} \ge \text{state_size}_{\text{mamba}} + $$ + + This padding strategy is still a work in progress. + +### Case 5: KV sharing + +KV sharing refers to a layer using the KV cache of another layer, e.g., gemma-3n. +In these models, [KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] ignores all layers with kv sharing and only allocates KV cache for layers that need kv cache, and some patches are made in model runner to apply the allocation result to kv sharing layers. + +## Prefix caching + +For simplicity, we assume `block_size=1` in this section. + +### High level idea + +The block pool uses a dict similar to `tuple(block_hash, group_id) -> block` to catch the full blocks. 
That means the same tokens of different groups are cached and evicted independently. + +When a new request comes in, we check the cache hit prefix of each group, and return the intersection of these groups as the cached prefix of the request. See below for the detailed algorithm for checking the cache hit of one group & performing the intersection. + +### Case 0: full attention only models + +For full attention layers, blocks are allocated for all tokens in the request. For details on the underlying design, see [Prefix Caching](prefix_caching.md) + +To find the longest cache hit prefix of a request, we enumerate from left (the first block) to right (the last block), checking whether the block is cached, and exit when cache misses. For example, we will return the first 7 tokens (0-6) as the cache hit prefix in the below example (blue blocks are cached): + +![Prefix Caching of Full Attention](../assets/design/hybrid_kv_cache_manager/full_attn.png) + +### Case 1: sliding window attention only models + +For sliding window attention layers, a naive implementation for memory allocation is to allocate `sliding_window_size` blocks and fill in the blocks in a round-robin way. But this naive implementation is not compatible with prefix caching so we didn't pick this design. In vLLM, we allocate different blocks for different tokens and free blocks that are outside the sliding window. + +For a new request, the cache hit prefix only requires the last `sliding_window_size - 1` tokens being cached. 
+Let's say `sliding_window_size = 4` and `block_size = 1`, and the request is a 15-token prompt (blue blocks are cached):
+
+![Prefix Caching of Sliding Window Attention](../assets/design/hybrid_kv_cache_manager/sw_attn.png)
+
+There are 3 possible cache hit prefixes:
+
+- cache hit length 5, compute prefill with [2, 3, 4] → [5, 6, …, 14]
+- cache hit length 6, compute prefill with [3, 4, 5] → [6, 7, …, 14]
+- cache hit length 14, compute prefill with [11, 12, 13] → [14] (most efficient)
+
+We can check the cache hit from right to left, and early exit when we find a match. This is opposite from full attention, where we check from left to right and early exit when the match fails. One potential con (compared to full attention) is that we end up iterating over the entire list of tokens when there's no match, which is often a common case. This could potentially cause non-negligible overheads, but it is fine with full + swa, as discussed below.
+
+### Case 2: sliding window attention + full attention models
+
+The first problem is how to find the cache hit prefix. We need to "intersect" the cache hits of global and sliding window attention layers by:
+
+1. Get the longest cache hit for full attention (scanning from left to right)
+2. Get the longest cache hit for sliding window attention that is within that length. Implemented by checking cache hits from right to left starting from the cache hit length of full attention.
+
+It can be ensured that the resulting cache hit of sliding window attention layers is also a cache hit of full attention layers. This is more efficient than finding all possible prefixes of each group and doing the intersection, because our approach can exit early if there is no cache hit.
+
+The algorithm applies to models with exactly two attention types: full attention + X, where X can be an arbitrary efficient attention algorithm like sliding window, llama 4 local attention, and mamba.
It doesn't support models without full attention layers, and models with more than 2 types of attention. This is enough for most hybrid models at the moment of writing this doc. + +The second question is the cache eviction policy. For now, we use one LRU queue for all kv cache groups. The blocks are added to the LRU queue when freed, either because the request is finished or the block is out of the sliding window. + +### Case 3: mamba models + +The prefix caching support of the mamba model is work in progress. Once implemented, models with mamba layer + full attention layer can be supported via the full attention + X algorithm in case 2. + +## Implementation + +### Overview + +![Overview of Hybrid KV Cache Manager](../assets/design/hybrid_kv_cache_manager/overview.png) + +The `KVCacheManager` is organized into 3 layers: + +- **[KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager]**: The interface between the scheduler and kv cache management system. +- **[KVCacheCoordinator][vllm.v1.core.kv_cache_coordinator.KVCacheCoordinator]**: coordinate per-group SingleTypeKVCacheManagers to generate the allocation result of a request. Depending on the model's configuration, one of these coordinators is chosen: + - **[KVCacheCoordinatorNoPrefixCache][vllm.v1.core.kv_cache_coordinator.KVCacheCoordinatorNoPrefixCache]**: Used when prefix caching is disabled. + - **[UnitaryKVCacheCoordinator][vllm.v1.core.kv_cache_coordinator.UnitaryKVCacheCoordinator]**: If only one KV cache group. The prefix caching logic is simplified as no intersection is needed. + - **[HybridKVCacheCoordinator][vllm.v1.core.kv_cache_coordinator.HybridKVCacheCoordinator]**: Handles exactly two KV cache groups (must include one full‑attention group plus one other efficient‑attention group). Other cases are not implemented. You can disable prefix caching to use the KVCacheCoordinatorNoPrefixCache. 
+- **[SingleTypeKVCacheManager][vllm.v1.core.single_type_kv_cache_manager.SingleTypeKVCacheManager]**: Each instance manages allocation and prefix caching for one KV cache group, implementing the attention‑type–specific logic (e.g., full attention, sliding window, Mamba). + +The blue box in the above figure shows the case with 10 full attention layers and 20 sliding window attention layers, thus: + +- use `HybridKVCacheCoordinator` +- use 1 `FullAttentionManager` and 2 `SlidingWindowManager` for the 3 `KVCacheGroup`s. + +### Memory Layout + +For a model with n `KVCacheGroup`s, each with m layers, we allocate m buffers. Each buffer is shared by n layers, one from each group. + +The following figure is for a model with 10 full attention layers (full.0 - full.9) and 20 sliding window attention layers (sw.0-sw.19). It follows "case 2" in "Allocation" section and is divided into 3 groups: + +- Group 0: 10 full attention layers (full.0 - full.9) +- Group 1: 10 sliding window attention layers (sw.0 - sw.9) +- Group 2: 10 sliding window attention layers (sw.10 - sw.19) + +And for a request, we allocate 11 blocks with `block_id` 0-6 to group 0, 7-8 to group 1, and 9-10 to group 2. + +With such an example, the physical memory is divided into 10 buffers (`KVCacheTensor` 0 - `KVCacheTensor` 9). Each buffer is shared by 3 layers (e.g., `KVCacheTensor` 0 is shared by full.0 from group 0, sw.0 from group 1, and sw.10 from group 2) and is divided into pieces with size `block_size * kv_hidden_size`. The KV cache of these 3 attention layers are saved to different pieces of the buffer based on the allocated `block_ids`: + +![Example Memory Layout](../assets/design/hybrid_kv_cache_manager/memory_layout.png) + +!!! note + One logic "block" is mapped to 10 pieces in the 10 buffers of the physical memory. 
From 2f13319f47eb9a78b471c5ced0fcf90862cd16a2 Mon Sep 17 00:00:00 2001 From: Huzaifa Sidhpurwala <huzaifas@redhat.com> Date: Wed, 27 Aug 2025 00:41:36 +0400 Subject: [PATCH 050/112] Enhance the pre-notification policy (#23532) Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com> --- SECURITY.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/SECURITY.md b/SECURITY.md index 414669fb3712e..d6319cdb1ac27 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -42,4 +42,9 @@ For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we ma * If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis. +* Organizations and vendors who either ship or use vLLM, are eligible to join the prenotification group if they meet at least one of the following qualifications + * Substantial internal deployment leveraging the upstream vLLM project. + * Established internal security teams and comprehensive compliance measures. + * Active and consistent contributions to the upstream vLLM project. + * We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included. 
From 6421b66bf4894a3e1e22d17c78901e3974173e09 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 26 Aug 2025 23:26:46 +0100 Subject: [PATCH 051/112] [Docs] Move quant supported hardware table to README (#23663) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/features/quantization/README.md | 48 ++++++++++++++++++- docs/features/quantization/bitblas.md | 2 +- .../quantization/supported_hardware.md | 32 ------------- 3 files changed, 48 insertions(+), 34 deletions(-) delete mode 100644 docs/features/quantization/supported_hardware.md diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md index e18c128f30fc9..4605ba7781ed4 100644 --- a/docs/features/quantization/README.md +++ b/docs/features/quantization/README.md @@ -4,7 +4,6 @@ Quantization trades off model precision for smaller memory footprint, allowing l Contents: -- [Supported Hardware](supported_hardware.md) - [AutoAWQ](auto_awq.md) - [AutoRound](auto_round.md) - [BitsAndBytes](bnb.md) @@ -19,3 +18,50 @@ Contents: - [AMD Quark](quark.md) - [Quantized KV Cache](quantized_kvcache.md) - [TorchAO](torchao.md) + +## Supported Hardware + +The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: + +<style> +td:not(:first-child) { + text-align: center !important; +} +td { + padding: 0.5rem !important; + white-space: nowrap; +} + +th { + padding: 0.5rem !important; + min-width: 0 !important; +} + +th:not(:first-child) { + writing-mode: vertical-lr; + transform: rotate(180deg) +} +</style> + +| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | AWS Neuron | Google TPU | +|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------| +| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | +| 
GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | +| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | +| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ | +| BitBLAS | ✅︎ | ✅ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| BitBLAS (GPTQ) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | ❌ | + +- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. +- ✅︎ indicates that the quantization method is supported on the specified hardware. +- ❌ indicates that the quantization method is not supported on the specified hardware. + +!!! note + This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. + + For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team. diff --git a/docs/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md index 6f53a448ee364..53b689ad53ff6 100644 --- a/docs/features/quantization/bitblas.md +++ b/docs/features/quantization/bitblas.md @@ -5,7 +5,7 @@ vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more effic !!! note Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`). Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper. - For details see [supported hardware](supported_hardware.md). + For details see [supported hardware](README.md#supported-hardware). 
Below are the steps to utilize BitBLAS with vLLM. diff --git a/docs/features/quantization/supported_hardware.md b/docs/features/quantization/supported_hardware.md deleted file mode 100644 index 06264d08b56aa..0000000000000 --- a/docs/features/quantization/supported_hardware.md +++ /dev/null @@ -1,32 +0,0 @@ -# Supported Hardware - -The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: - -<style> -th { - white-space: nowrap; - min-width: 0 !important; -} -</style> - -| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | AWS Neuron | Google TPU | -|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------| -| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | -| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | -| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | -| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ | -| BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | ❌ | - -- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. -- ✅︎ indicates that the quantization method is supported on the specified hardware. -- ❌ indicates that the quantization method is not supported on the specified hardware. - -!!! note - This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. 
- - For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team. From c3b0fd1ee670079649cd58abd99376bee521a8ff Mon Sep 17 00:00:00 2001 From: Zhonghua Deng <abzhonghua@gmail.com> Date: Wed, 27 Aug 2025 06:56:16 +0800 Subject: [PATCH 052/112] [V1][P/D]P2pNcclConnector supports flashinfer (#23536) Signed-off-by: Abatom <abzhonghua@gmail.com> Co-authored-by: Simon Mo <simon.mo@hey.com> --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 158 +++++++++--------- 1 file changed, 78 insertions(+), 80 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 25675d70fe225..2485c57d86ecc 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -30,27 +30,19 @@ logger = init_logger(__name__) class ReqMeta: # Request Id request_id: str - # Request tokens - token_ids: torch.Tensor - # Slot mappings, should have the same length as token_ids - slot_mapping: torch.Tensor + # Request block ids + block_ids: torch.Tensor + # Request num tokens + num_tokens: int @staticmethod def make_meta(request_id: str, token_ids: list[int], block_ids: list[int], block_size: int) -> "ReqMeta": - valid_num_tokens = len(token_ids) - token_ids_tensor = torch.tensor(token_ids) block_ids_tensor = torch.tensor(block_ids) - num_blocks = block_ids_tensor.shape[0] - block_offsets = torch.arange(0, block_size) - slot_mapping = block_offsets.reshape((1, block_size)) + \ - block_ids_tensor.reshape((num_blocks, 1)) * block_size - slot_mapping = slot_mapping.flatten()[:valid_num_tokens] - return ReqMeta( request_id=request_id, - token_ids=token_ids_tensor, - slot_mapping=slot_mapping, + block_ids=block_ids_tensor, + num_tokens=len(token_ids), ) @@ -123,63 +115,58 
@@ class P2pNcclConnector(KVConnectorBase_V1): return def inject_kv_into_layer( - dst_kv_cache_layer: torch.Tensor, - src_kv_cache: torch.Tensor, - slot_mapping: torch.Tensor, + layer: torch.Tensor, + kv_cache: torch.Tensor, + block_ids: torch.Tensor, request_id: str, ) -> None: - """Inject the KV cache into the layer. + """ + Inject KV cache data into a given attention layer tensor. + + This function updates `layer` in-place with values from `kv_cache`, + handling different backend layouts: + - MLA (Multi-Linear Attention) or FlashInfer: KV tensors are + indexed along the first dimension. + - FlashAttention: KV tensors are indexed along the second + dimension. + + If the number of provided block IDs does not match the number of KV + blocks, only the overlapping portion is updated, and a warning is + logged. Args: - dst_kv_cache_layer (torch.Tensor): the destination KV cache - layer. In shape [2, num_pages, page_size, xxx] if not - using MLA, [num_pages, page_size, xxx] otherwise. - src_kv_cache (torch.Tensor): the source KV cache. In shape - [2, num_tokens, xxx] if not using MLA, [num_tokens, xxx] - otherwise. - slot_mapping (torch.Tensor): the slot mapping. In shape - [num_tokens]. - request_id (str): request id for log + layer (torch.Tensor): The attention layer KV tensor to update. + kv_cache (torch.Tensor): The KV cache tensor to inject. + block_ids (torch.Tensor): Indices of the blocks to update. + request_id (str): Request identifier used for logging. + + Returns: + None. The function modifies `layer` in-place. 
""" - dst_kv_cache_layer_shape = dst_kv_cache_layer.shape - if isinstance(attn_metadata, MLACommonMetadata): - num_pages = dst_kv_cache_layer_shape[0] - page_size = dst_kv_cache_layer_shape[1] - dst_kv_cache_layer = dst_kv_cache_layer.reshape( - num_pages * page_size, -1) - self.check_tensors_except_dim(dst_kv_cache_layer, src_kv_cache, - 0) - num_token = src_kv_cache.shape[0] - if len(slot_mapping) == num_token: - dst_kv_cache_layer[slot_mapping, ...] = src_kv_cache + if (isinstance(attn_metadata, MLACommonMetadata) + or layer.shape[1] == 2): # MLA or FlashInfer + num_block = kv_cache.shape[0] + self.check_tensors_except_dim(layer, kv_cache, 0) + if len(block_ids) == num_block: + layer[block_ids, ...] = kv_cache else: - dst_kv_cache_layer[slot_mapping[:num_token], - ...] = src_kv_cache + layer[block_ids[:num_block], ...] = kv_cache logger.warning( - "🚧src_kv_cache does not match, num_slot:%d, " - "num_token:%d, request_id:%s", len(slot_mapping), - num_token, request_id) + "🚧kv_cache does not match, block_ids:%d, " + "num_block:%d, request_id:%s", len(block_ids), + num_block, request_id) - dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) - else: - num_pages = dst_kv_cache_layer_shape[1] - page_size = dst_kv_cache_layer_shape[2] - dst_kv_cache_layer = dst_kv_cache_layer.reshape( - 2, num_pages * page_size, -1) - self.check_tensors_except_dim(dst_kv_cache_layer, src_kv_cache, - 1) - num_token = src_kv_cache.shape[1] - if len(slot_mapping) == num_token: - dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache + elif layer.shape[0] == 2: # FlashAttention + num_block = kv_cache.shape[1] + self.check_tensors_except_dim(layer, kv_cache, 1) + if len(block_ids) == num_block: + layer[:, block_ids, ...] = kv_cache else: - dst_kv_cache_layer[:, slot_mapping[:num_token], - ...] = src_kv_cache + layer[:, block_ids[:num_block], ...] 
= kv_cache logger.warning( - "🚧src_kv_cache does not match, num_slot:%d, " - "num_token:%d, request_id:%s", len(slot_mapping), - num_token, request_id) - - dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) + "🚧kv_cache does not match, block_ids:%d, " + "num_block:%d, request_id:%s", len(block_ids), + num_block, request_id) # Get the metadata metadata: KVConnectorMetadata = \ @@ -201,19 +188,17 @@ class P2pNcclConnector(KVConnectorBase_V1): if kv_cache is None: continue - kv_cache_layer = kv_cache[ \ - forward_context.virtual_engine] + layer = kv_cache[forward_context.virtual_engine] kv_cache = self.p2p_nccl_engine.recv_tensor( request.request_id + "#" + layer_name) if kv_cache is None: - logger.warning("🚧src_kv_cache is None, %s", - request.request_id) + logger.warning("🚧kv_cache is None, %s", request.request_id) continue - inject_kv_into_layer(kv_cache_layer, kv_cache, - request.slot_mapping, request.request_id) + inject_kv_into_layer(layer, kv_cache, request.block_ids, + request.request_id) def wait_for_layer_load(self, layer_name: str) -> None: """Blocking until the KV for a specific layer is loaded into vLLM's @@ -247,20 +232,33 @@ class P2pNcclConnector(KVConnectorBase_V1): def extract_kv_from_layer( layer: torch.Tensor, - slot_mapping: torch.Tensor, + block_ids: torch.Tensor, ) -> torch.Tensor: - """Extract the KV cache from the layer. - - Assume the shape of the layer is (2, num_pages, page_size, xxx) - if MLA is not used, and (num_pages, page_size, xxx) otherwise. """ - if isinstance(attn_metadata, MLACommonMetadata): - num_pages, page_size = layer.shape[0], layer.shape[1] - return layer.reshape(num_pages * page_size, -1)[slot_mapping, - ...] - num_pages, page_size = layer.shape[1], layer.shape[2] - return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, - ...] + Extract KV cache slices from a given attention layer tensor. 
+ + This function handles multiple backend layouts: + - MLA (Multi-Linear Attention) or FlashInfer: KV tensors are + indexed along the first dimension. + - FlashAttention: KV tensors are indexed along the second + dimension. + + Args: + layer (torch.Tensor): The KV cache from the attention layer. + block_ids (torch.Tensor): Indices of blocks to extract. + + Returns: + torch.Tensor: A tensor containing the extracted KV slices. + Returns None if the layout is unsupported. + """ + if (isinstance(attn_metadata, MLACommonMetadata) + or layer.shape[1] == 2): # MLA or FlashInfer + return layer[block_ids, ...] + + if layer.shape[0] == 2: # FlashAttention + return layer[:, block_ids, ...] + + return None connector_metadata = self._get_connector_metadata() assert isinstance(connector_metadata, P2pNcclConnectorMetadata) @@ -269,7 +267,7 @@ class P2pNcclConnector(KVConnectorBase_V1): ip, port = self.parse_request_id(request_id, True) remote_address = ip + ":" + str(port + self._rank) - kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) + kv_cache = extract_kv_from_layer(kv_layer, request.block_ids) self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name, kv_cache, remote_address) From 5f1af97f86021cf2819e5ab2d84722dac53c2257 Mon Sep 17 00:00:00 2001 From: Thomas Parnell <tpa@zurich.ibm.com> Date: Wed, 27 Aug 2025 01:28:55 +0200 Subject: [PATCH 053/112] [V1] [Hybrid] Enable Full CUDA graph by default for hybrid models in V1 (#22594) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> --- vllm/model_executor/models/config.py | 42 ++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 882df7e8162c5..f62209326b988 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -4,6 +4,7 @@ from copy import deepcopy from typing import TYPE_CHECKING import vllm.envs as envs +from vllm.config.compilation import CUDAGraphMode from 
vllm.logger import init_logger from vllm.model_executor.models import ModelRegistry from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv @@ -275,6 +276,42 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig): "%d for performance.", 1024) +class MambaModelConfig(VerifyAndUpdateConfig): + + @classmethod + def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: + """ + Enable FULL_AND_PIECEWISE cuda graph mode by default (required + to get good performance for mamba layers in V1). + + Args: + vllm_config: vLLM Config + """ + + if not envs.VLLM_USE_V1: + return + + model_config = vllm_config.model_config + compilation_config = vllm_config.compilation_config + + model_cls, _ = ModelRegistry.resolve_model_cls( + model_config.architecture, + model_config=model_config, + ) + + # TODO(tdoublep): remove as full cuda graph support is added + FCG_NOT_SUPPORTED_MODELS = [ + "Lfm2ForCausalLM", "MiniMaxText01ForCausalLM" + ] + + if (model_config.architecture not in FCG_NOT_SUPPORTED_MODELS + and compilation_config.cudagraph_mode is None): + logger.info( + "Hybrid or mamba-based model detected: setting cudagraph mode " + "to FULL_AND_PIECEWISE in order to optimize performance.") + compilation_config.cudagraph_mode = CUDAGraphMode.FULL_AND_PIECEWISE + + class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): @classmethod @@ -293,6 +330,9 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): if not envs.VLLM_USE_V1: return + # Enable FULL_AND_PIECEWISE by default + MambaModelConfig.verify_and_update_config(vllm_config) + cache_config = vllm_config.cache_config model_config = vllm_config.model_config parallel_config = vllm_config.parallel_config @@ -374,4 +414,6 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "JambaForSequenceClassification": JambaForSequenceClassificationConfig, "GraniteMoeHybridForCausalLM": GraniteMoeHybridModelConfig, "GptOssForCausalLM": GptOssForCausalLMConfig, + "MambaForCausalLM": MambaModelConfig, + 
"Mamba2ForCausalLM": MambaModelConfig, } From 714872f1a9c779c2ce9bbf5440f08ec278dc569a Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 26 Aug 2025 19:48:32 -0400 Subject: [PATCH 054/112] [Compile] Fix Cmake Warning (#23689) Signed-off-by: yewentao256 <zhyanwentao@126.com> --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b0ed4a284db95..b0eb0f32e03a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,7 +30,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) # Supported python versions. These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. # -set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12", "3.13") +set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13") # Supported AMD GPU architectures. set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") From 585e0bde36abdb2ab2967fd42005cbe62459020e Mon Sep 17 00:00:00 2001 From: Federico <65908512+coval3nte@users.noreply.github.com> Date: Wed, 27 Aug 2025 02:29:52 +0200 Subject: [PATCH 055/112] [Bugfix] UnboundLocalError when GptOss reasoning specified (#23054) Signed-off-by: Federico <65908512+coval3nte@users.noreply.github.com> --- vllm/entrypoints/openai/serving_chat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 8b50153f01152..7e0e627780970 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -887,7 +887,8 @@ class OpenAIServingChat(OpenAIServing): delta_message = DeltaMessage(content=delta_text) # update the previous values for the next iteration - if tool_choice_auto or self.reasoning_parser: + if ((tool_choice_auto or self.reasoning_parser) + and not self.use_harmony): assert 
previous_texts is not None assert all_previous_token_ids is not None previous_texts[i] = current_text From b1625dbe9cee497c0eefd9d1221377f64fec1e03 Mon Sep 17 00:00:00 2001 From: zixuanzhang226 <zixuanzhang@bytedance.com> Date: Tue, 26 Aug 2025 18:06:10 -0700 Subject: [PATCH 056/112] feat: add triton fused moe config for GLM-4.5-Air-FP8 on B200 (#23695) Signed-off-by: Zixuan Zhang <zixuanzhang@bytedance.com> --- ...evice_name=NVIDIA_B200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..b962d19506ce5 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} From 6891205b161e78ea6e255da194a4470e06997a3b Mon Sep 17 00:00:00 2001 From: wuhang <wuhang6@huawei.com> Date: Wed, 27 Aug 2025 09:06:58 +0800 Subject: [PATCH 057/112] [Feature][Responses API] Support MCP tool in background mode (#23494) Signed-off-by: wuhang <wuhang6@huawei.com> --- vllm/entrypoints/context.py | 31 ++- vllm/entrypoints/openai/serving_responses.py | 265 ++++++++++--------- 2 files 
changed, 162 insertions(+), 134 deletions(-) diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index f70e1fc207f86..9d587e8669339 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -4,13 +4,15 @@ import json import logging from abc import ABC, abstractmethod from collections.abc import Sequence -from typing import TYPE_CHECKING, Union +from contextlib import AsyncExitStack +from typing import TYPE_CHECKING, Optional, Union from openai_harmony import Author, Message, Role, StreamState, TextContent from vllm.entrypoints.harmony_utils import ( get_encoding, get_streamable_parser_for_assistant, render_for_completion) from vllm.entrypoints.tool import Tool +from vllm.entrypoints.tool_server import ToolServer from vllm.outputs import RequestOutput if TYPE_CHECKING: @@ -37,6 +39,11 @@ class ConversationContext(ABC): def render_for_completion(self) -> list[int]: pass + @abstractmethod + async def init_tool_sessions(self, tool_server: Optional[ToolServer], + exit_stack: AsyncExitStack) -> None: + pass + class SimpleContext(ConversationContext): @@ -55,16 +62,21 @@ class SimpleContext(ConversationContext): def render_for_completion(self) -> list[int]: raise NotImplementedError("Should not be called.") + async def init_tool_sessions(self, tool_server: Optional[ToolServer], + exit_stack: AsyncExitStack) -> None: + pass + class HarmonyContext(ConversationContext): def __init__( self, messages: list, - tool_sessions: dict[str, Tool], + available_tools: list[str], ): self._messages = messages - self.tool_sessions = tool_sessions + self.available_tools = available_tools + self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {} self.parser = get_streamable_parser_for_assistant() self.num_init_messages = len(messages) @@ -116,10 +128,10 @@ class HarmonyContext(ConversationContext): if recipient is not None: if recipient.startswith("browser."): return await self.call_search_tool( - self.tool_sessions["browser"], last_msg) + 
self._tool_sessions["browser"], last_msg) elif recipient.startswith("python"): return await self.call_python_tool( - self.tool_sessions["python"], last_msg) + self._tool_sessions["python"], last_msg) raise ValueError("No tool call found") def render_for_completion(self) -> list[int]: @@ -161,6 +173,15 @@ class HarmonyContext(ConversationContext): recipient=Role.ASSISTANT) ] + async def init_tool_sessions(self, tool_server: Optional[ToolServer], + exit_stack: AsyncExitStack) -> None: + if tool_server: + for tool_name in self.available_tools: + if tool_name not in self._tool_sessions: + self._tool_sessions[ + tool_name] = await exit_stack.enter_async_context( + tool_server.new_session(tool_name)) + class StreamingHarmonyContext(HarmonyContext): diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 5adcb310e3468..67eec2d523e3f 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -8,7 +8,7 @@ from collections.abc import AsyncGenerator, AsyncIterator, Sequence from contextlib import AsyncExitStack from copy import copy from http import HTTPStatus -from typing import Any, Callable, Final, Optional, Union +from typing import Callable, Final, Optional, Union import jinja2 import openai.types.responses as openai_responses_types @@ -248,10 +248,10 @@ class OpenAIServingResponses(OpenAIServing): raw_request.state.request_metadata = request_metadata if self.tool_server is not None and isinstance( - self.tool_server, MCPToolServer - ) and (request.background or request.stream) and request.tools and any( - tool.type in ["web_search_preview", "code_interpreter"] - for tool in request.tools): + self.tool_server, + MCPToolServer) and request.stream and request.tools and any( + tool.type in ["web_search_preview", "code_interpreter"] + for tool in request.tools): return self.create_error_response( "MCP tool server is not supported in background mode and " "streaming 
mode") @@ -265,103 +265,70 @@ class OpenAIServingResponses(OpenAIServing): builtin_tool_list.append("browser") if self.tool_server.has_tool("python"): builtin_tool_list.append("python") - async with AsyncExitStack() as exit_stack: - try: - if self.tool_server is not None: - # TODO: initialize tool sessions lazily when the session - # is actually used. - tool_session_ctxs: dict[str, Any] = { - tool_name: - exit_stack.enter_async_context( - self.tool_server.new_session(tool_name)) - for tool_name in builtin_tool_list - } - tool_sessions = {} - for tool_name in builtin_tool_list: - tool_sessions[tool_name] = ( - await tool_session_ctxs[tool_name]) - else: - assert len(builtin_tool_list) == 0 - tool_sessions = {} - for i, engine_prompt in enumerate(engine_prompts): - default_max_tokens = self.max_model_len - len( - engine_prompt["prompt_token_ids"]) - sampling_params = request.to_sampling_params( - default_max_tokens, self.default_sampling_params) - trace_headers = (None if raw_request is None else await - self._get_trace_headers( - raw_request.headers)) + if self.tool_server is not None: + available_tools = builtin_tool_list + else: + assert len(builtin_tool_list) == 0 + available_tools = [] + try: + for i, engine_prompt in enumerate(engine_prompts): + default_max_tokens = self.max_model_len - len( + engine_prompt["prompt_token_ids"]) + sampling_params = request.to_sampling_params( + default_max_tokens, self.default_sampling_params) - context: ConversationContext - if self.use_harmony: - if request.stream: - context = StreamingHarmonyContext( - messages, tool_sessions) - else: - context = HarmonyContext(messages, tool_sessions) + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) + + context: ConversationContext + if self.use_harmony: + if request.stream: + context = StreamingHarmonyContext( + messages, available_tools) else: - context = SimpleContext() - generator = self._generate_with_builtin_tools( - 
request_id=request.request_id, - request_prompt=request_prompts[i], - engine_prompt=engine_prompt, - sampling_params=sampling_params, - context=context, - lora_request=lora_request, - priority=request.priority, - trace_headers=trace_headers, - ) - generators.append(generator) - except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) - - assert len(generators) == 1 - result_generator, = generators - - # Store the input messages. - if request.store: - self.msg_store[request.request_id] = messages - - if request.background: - created_time = int(time.time()) - response = ResponsesResponse.from_request( - request, - sampling_params, - model_name=model_name, - created_time=created_time, - output=[], - status="queued", - usage=None, + context = HarmonyContext(messages, available_tools) + else: + context = SimpleContext() + generator = self._generate_with_builtin_tools( + request_id=request.request_id, + request_prompt=request_prompts[i], + engine_prompt=engine_prompt, + sampling_params=sampling_params, + context=context, + lora_request=lora_request, + priority=request.priority, + trace_headers=trace_headers, ) - async with self.response_store_lock: - self.response_store[response.id] = response + generators.append(generator) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) - # Run the request in the background. - task = asyncio.create_task( - self._run_background_request( - request, - sampling_params, - result_generator, - context, - model_name, - tokenizer, - request_metadata, - created_time, - ), - name=f"create_{response.id}", - ) + assert len(generators) == 1 + result_generator, = generators - # For cleanup. - response_id = response.id - self.background_tasks[response_id] = task - task.add_done_callback( - lambda _: self.background_tasks.pop(response_id, None)) - return response + # Store the input messages. 
+ if request.store: + self.msg_store[request.request_id] = messages - if request.stream: - return self.responses_stream_generator( + if request.background: + created_time = int(time.time()) + response = ResponsesResponse.from_request( + request, + sampling_params, + model_name=model_name, + created_time=created_time, + output=[], + status="queued", + usage=None, + ) + async with self.response_store_lock: + self.response_store[response.id] = response + + # Run the request in the background. + task = asyncio.create_task( + self._run_background_request( request, sampling_params, result_generator, @@ -369,21 +336,41 @@ class OpenAIServingResponses(OpenAIServing): model_name, tokenizer, request_metadata, - ) + created_time, + ), + name=f"create_{response.id}", + ) - try: - return await self.responses_full_generator( - request, - sampling_params, - result_generator, - context, - model_name, - tokenizer, - request_metadata, - ) - except Exception as e: - return self.create_error_response(str(e)) - return self.create_error_response("Should not reach here") + # For cleanup. 
+ response_id = response.id + self.background_tasks[response_id] = task + task.add_done_callback( + lambda _: self.background_tasks.pop(response_id, None)) + return response + + if request.stream: + return self.responses_stream_generator( + request, + sampling_params, + result_generator, + context, + model_name, + tokenizer, + request_metadata, + ) + + try: + return await self.responses_full_generator( + request, + sampling_params, + result_generator, + context, + model_name, + tokenizer, + request_metadata, + ) + except Exception as e: + return self.create_error_response(str(e)) async def _make_request( self, @@ -439,14 +426,16 @@ class OpenAIServingResponses(OpenAIServing): if created_time is None: created_time = int(time.time()) - try: - async for _ in result_generator: - pass - except asyncio.CancelledError: - return self.create_error_response("Client disconnected") - except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + async with AsyncExitStack() as exit_stack: + try: + await context.init_tool_sessions(self.tool_server, exit_stack) + async for _ in result_generator: + pass + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) if self.use_harmony: assert isinstance(context, HarmonyContext) @@ -838,7 +827,7 @@ class OpenAIServingResponses(OpenAIServing): status_code=HTTPStatus.BAD_REQUEST, ) - async def responses_stream_generator( + async def _process_streaming_events( self, request: ResponsesRequest, sampling_params: SamplingParams, @@ -847,18 +836,8 @@ class OpenAIServingResponses(OpenAIServing): model_name: str, tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, - created_time: Optional[int] = None, + created_time: int, ) -> AsyncGenerator[str, None]: - # TODO: - # 1. 
Handle disconnect - - if not isinstance(context, StreamingHarmonyContext): - raise NotImplementedError( - "Streaming is not supported for responses API without Harmony." - ) - - created_time = created_time or int(time.time()) - sequence_number = 0 def _send_event(event: BaseModel): @@ -1270,3 +1249,31 @@ class OpenAIServingResponses(OpenAIServing): sequence_number=-1, response=final_response.model_dump(), )) + + async def responses_stream_generator( + self, + request: ResponsesRequest, + sampling_params: SamplingParams, + result_generator: AsyncIterator[Optional[ConversationContext]], + context: ConversationContext, + model_name: str, + tokenizer: AnyTokenizer, + request_metadata: RequestResponseMetadata, + created_time: Optional[int] = None, + ) -> AsyncGenerator[str, None]: + # TODO: + # 1. Handle disconnect + + if not isinstance(context, StreamingHarmonyContext): + raise NotImplementedError( + "Streaming is not supported for responses API without Harmony." + ) + + created_time = created_time or int(time.time()) + + async with AsyncExitStack() as exit_stack: + await context.init_tool_sessions(self.tool_server, exit_stack) + async for event_data in self._process_streaming_events( + request, sampling_params, result_generator, context, + model_name, tokenizer, request_metadata, created_time): + yield event_data From c7c80af084e4d87c4e73148cb71ee990970281ff Mon Sep 17 00:00:00 2001 From: yzds <41983536+youzhedian@users.noreply.github.com> Date: Wed, 27 Aug 2025 09:21:11 +0800 Subject: [PATCH 058/112] fix pynccl reduce_scatter (#23648) Co-authored-by: hongchao <hongchao@msh.team> --- vllm/distributed/device_communicators/cuda_communicator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index 0ea8de2f36f4b..eef3f9f75f9f1 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ 
b/vllm/distributed/device_communicators/cuda_communicator.py @@ -152,7 +152,7 @@ class CudaCommunicator(DeviceCommunicatorBase): dtype=input_tensor.dtype, device=input_tensor.device) - pynccl_comm.reduce_scatter(output, input_) + pynccl_comm.reduce_scatter(output, input_tensor) # Reshape before returning return output.movedim(0, dim).contiguous() @@ -186,9 +186,9 @@ class CudaCommunicator(DeviceCommunicatorBase): device=input_tensor.device) if sizes is not None: - pynccl_comm.reduce_scatterv(output, input_, sizes=sizes) + pynccl_comm.reduce_scatterv(output, input_tensor, sizes=sizes) else: - pynccl_comm.reduce_scatter(output, input_) + pynccl_comm.reduce_scatter(output, input_tensor) # Reshape before returning return output.movedim(0, dim).contiguous() From 2c2b140ae8c60dc0c38e4d37274fc7106a72c21b Mon Sep 17 00:00:00 2001 From: czhu-cohere <conway.zhu@cohere.com> Date: Tue, 26 Aug 2025 21:23:23 -0400 Subject: [PATCH 059/112] [quantization] use channel scales for w4a8 + misc fixes (#23570) Signed-off-by: czhu-cohere <conway.zhu@cohere.com> --- tests/quantization/test_compressed_tensors.py | 44 +++++++++++++++++-- .../schemes/compressed_tensors_w4a8_fp8.py | 13 +++++- .../kernels/mixed_precision/MPLinearKernel.py | 1 + .../kernels/mixed_precision/cutlass.py | 19 ++++---- 4 files changed, 63 insertions(+), 14 deletions(-) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 296743dbfa041..b9774b7ee2631 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -14,10 +14,10 @@ from compressed_tensors.quantization import QuantizationType from tests.models.utils import check_logprobs_close from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensors24, CompressedTensorsLinearMethod, - CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4, - CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, 
- CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, - CompressedTensorsWNA16) + CompressedTensorsW4A4Fp4, CompressedTensorsW4A8Fp8, + CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24, + CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, + CompressedTensorsW8A16Fp8, CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.utils.quant_utils import ( cutlass_fp4_supported) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( @@ -683,3 +683,39 @@ def test_compressed_tensors_nvfp4(vllm_runner, args): output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) assert output + + +@pytest.mark.skipif( + not current_platform.is_cuda() + or not current_platform.has_device_capability(90), + reason="W4A8 FP8 is not yet supported on this GPU type.", +) +@pytest.mark.parametrize("args", [ + ("czhu-cohere/TinyLlama-1.1B-Chat-v1.0-W4A8-e2e", CompressedTensorsW4A8Fp8) +]) +def test_compressed_tensors_w4a8_fp8(vllm_runner, args): + model, scheme = args + with vllm_runner(model, enforce_eager=True) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + for proj in (qkv_proj, o_proj, gate_up_proj, down_proj): + assert isinstance(proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(proj.scheme, scheme) + + assert proj.weight_packed.dtype is torch.int32 + assert proj.weight_scale.dtype is torch.float8_e4m3fn + assert proj.weight_chan_scale.dtype is torch.float32 + assert proj.scheme.group_size == 128 + + llm.apply_model(check_model) + output = llm.generate_greedy("Hello my name is", max_tokens=20) + print(output) + assert output diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py index 
f6cc49c2316ba..3d9827058803e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py @@ -79,7 +79,8 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme): act_type=torch.float8_e4m3fn, # always use fp8(e4m3) group_size=self.group_size, zero_points=not self.symmetric, - has_g_idx=self.has_g_idx + has_g_idx=self.has_g_idx, + out_type=params_dtype ) kernel_type = choose_mp_linear_kernel(mp_linear_kernel_config) @@ -122,7 +123,7 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme): torch.empty( output_size_per_partition, scales_and_zp_size, - dtype=params_dtype, + dtype=torch.float8_e4m3fn, ) } @@ -140,9 +141,17 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme): dtype=torch.int64), weight_loader=weight_loader) + # per-channel scales + weight_chan_scale = ChannelQuantScaleParameter( + data=torch.empty((output_size_per_partition, 1), + dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) + layer.register_parameter("weight_packed", weight) layer.register_parameter("weight_scale", weight_scale) layer.register_parameter("weight_shape", weight_shape) + layer.register_parameter("weight_chan_scale", weight_chan_scale) self.kernel = kernel_type(mp_linear_kernel_config, w_q_param_name="weight_packed", diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py index 07ecc096231a4..1280f5f1eadf7 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py @@ -20,6 +20,7 @@ class MPLinearLayerConfig: group_size: int zero_points: bool has_g_idx: bool + out_type: Optional[torch.dtype] = None class MPLinearKernel(ABC): diff --git 
a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py index f1d49693fc016..9e23c0dd3595b 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py @@ -60,13 +60,17 @@ class CutlassW4A8LinearKernel(MPLinearKernel): if in_features % 128 or out_features % 128: return False, "K and N must be divisible by 128, got "\ f"{c.partition_weight_shape}" + + if c.out_type != torch.bfloat16: + return False, "Only bfloat16 output type currently supported"\ + f"got {c.out_type=}" + return True, None # note assumes that # `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0} # `weight_scale` is: {input_dim = 0, output_dim = 1} def process_weights_after_loading(self, layer: torch.nn.Module): - c = self.config # TODO(czhu): optimize speed/mem usage def transform_w_q(x): @@ -86,19 +90,15 @@ class CutlassW4A8LinearKernel(MPLinearKernel): # Encode/reorder weights and pack scales self._transform_param(layer, self.w_q_name, transform_w_q) self._transform_param(layer, self.w_s_name, transform_w_s) - - # TODO(czhu): support loading channel scales - self.w_ch_s = torch.ones((c.partition_weight_shape[1], ), - dtype=torch.float32, - device='cuda') + self._transform_param(layer, "weight_chan_scale", lambda x: x) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - assert bias is None, "bias not supported by CUTLASS W4A8" c = self.config w_q, w_s, _, _ = self._get_weight_params(layer) + w_ch_s = layer.weight_chan_scale x_2d = x.reshape(-1, x.shape[-1]) out_shape = x.shape[:-1] + (c.partition_weight_shape[1], ) @@ -109,6 +109,9 @@ class CutlassW4A8LinearKernel(MPLinearKernel): b_group_scales=w_s, b_group_size=c.group_size, a_token_scales=act_scales, - b_channel_scales=self.w_ch_s) + b_channel_scales=w_ch_s) + 
+ if bias is not None: + output.add_(bias) # In-place add return output.reshape(out_shape) From eb1995167e04e01c465e1cf4c39d5fd0b2031724 Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Tue, 26 Aug 2025 18:23:26 -0700 Subject: [PATCH 060/112] [gpt-oss] Enable unit test for response API harmony integration (#23533) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- .../openai/test_response_api_with_harmony.py | 45 ++++++++++++------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 1ca52599c519d..72d468db08f65 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -11,18 +11,25 @@ from openai import BadRequestError, NotFoundError, OpenAI from ...utils import RemoteOpenAIServer -pytest.skip(allow_module_level=True, reason="gpt-oss can't run on CI yet.") - MODEL_NAME = "openai/gpt-oss-20b" -DTYPE = "bfloat16" @pytest.fixture(scope="module") -def server(): +def monkeypatch_module(): + from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() + yield mpatch + mpatch.undo() + + +@pytest.fixture(scope="module") +def server(monkeypatch_module: pytest.MonkeyPatch): args = ["--enforce-eager", "--tool-server", "demo"] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server + with monkeypatch_module.context() as m: + m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1") + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server @pytest_asyncio.fixture @@ -269,10 +276,11 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_streaming(client: OpenAI, model_name: str): + # TODO: Add back when web search and code interpreter are available in CI prompts = [ "tell me a 
story about a cat in 20 words", - "What is 13 * 24? Use python to calculate the result.", - "When did Jensen found NVIDIA? Search it and answer the year only.", + # "What is 13 * 24? Use python to calculate the result.", + # "When did Jensen found NVIDIA? Search it and answer the year only.", ] for prompt in prompts: @@ -281,15 +289,15 @@ async def test_streaming(client: OpenAI, model_name: str): input=prompt, reasoning={"effort": "low"}, tools=[ - { - "type": "web_search_preview" - }, - { - "type": "code_interpreter", - "container": { - "type": "auto" - } - }, + # { + # "type": "web_search_preview" + # }, + # { + # "type": "code_interpreter", + # "container": { + # "type": "auto" + # } + # }, ], stream=True, ) @@ -317,6 +325,7 @@ async def test_streaming(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.skip(reason="Web search tool is not available in CI yet.") async def test_web_search(client: OpenAI, model_name: str): response = await client.responses.create( model=model_name, @@ -331,6 +340,7 @@ async def test_web_search(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.") async def test_code_interpreter(client: OpenAI, model_name: str): response = await client.responses.create( model=model_name, @@ -436,6 +446,7 @@ async def test_function_calling(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.flaky(reruns=5) async def test_function_calling_multi_turn(client: OpenAI, model_name: str): tools = [ { From de02b07db4741cc9ed40b8262d7a67e6bce30211 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Tue, 26 Aug 2025 21:34:57 -0400 Subject: [PATCH 061/112] [Bugfix] Lazy import gpt_oss_triton_kernels_moe for mxfp4 (#23678) Signed-off-by: mgoin <mgoin64@gmail.com> --- 
vllm/model_executor/layers/quantization/mxfp4.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index df96e5d8c413e..bdeb169a4b97f 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -10,8 +10,6 @@ from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase) -from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( - triton_kernel_moe_forward) from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -557,6 +555,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): )[0] return trtllm_gen_output else: + from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( # noqa: E501 + triton_kernel_moe_forward) return triton_kernel_moe_forward( hidden_states=x, w1=self.w13_weight_triton_tensor, From 6dab89b8ece7e022bd3df5774c9ddf309e2eb2d9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 27 Aug 2025 02:47:08 +0100 Subject: [PATCH 062/112] [Docs] Fix math rendering in docs (#23676) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/mkdocs/javascript/mathjax.js | 20 ++++++++++++++++++++ mkdocs.yaml | 7 ++++--- requirements/docs.txt | 1 - 3 files changed, 24 insertions(+), 4 deletions(-) create mode 100644 docs/mkdocs/javascript/mathjax.js diff --git a/docs/mkdocs/javascript/mathjax.js b/docs/mkdocs/javascript/mathjax.js new file mode 100644 index 0000000000000..5da0d443578c4 --- /dev/null +++ b/docs/mkdocs/javascript/mathjax.js @@ -0,0 +1,20 @@ +// Enables MathJax rendering +window.MathJax = { + tex: { + inlineMath: [["\\(", "\\)"]], + displayMath: [["\\[", 
"\\]"]], + processEscapes: true, + processEnvironments: true + }, + options: { + ignoreHtmlClass: ".*|", + processHtmlClass: "arithmatex" + } +}; + +document$.subscribe(() => { + MathJax.startup.output.clearCache() + MathJax.typesetClear() + MathJax.texReset() + MathJax.typesetPromise() +}) diff --git a/mkdocs.yaml b/mkdocs.yaml index 47fe1ebce9712..507a80c41e8b4 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -129,15 +129,16 @@ markdown_extensions: - toc: permalink: true # For math rendering - - mdx_math: - enable_dollar_delimiter: true + - pymdownx.arithmatex: + generic: true extra_css: - mkdocs/stylesheets/extra.css extra_javascript: - mkdocs/javascript/run_llm_widget.js - - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML + - mkdocs/javascript/mathjax.js + - https://unpkg.com/mathjax@3.2.2/es5/tex-mml-chtml.js - mkdocs/javascript/edit_and_feedback.js - mkdocs/javascript/slack_and_forum.js diff --git a/requirements/docs.txt b/requirements/docs.txt index 3b72a8a9e755e..d1c546398780a 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -7,7 +7,6 @@ mkdocs-awesome-nav mkdocs-glightbox mkdocs-git-revision-date-localized-plugin mkdocs-minify-plugin -python-markdown-math regex ruff From fecbb7c782980d0d9d104784a233ecb95a20ddda Mon Sep 17 00:00:00 2001 From: Wei <weiweinpu@gmail.com> Date: Tue, 26 Aug 2025 19:54:23 -0700 Subject: [PATCH 063/112] [Bugfix][gpt-oss] passing the cache config in gpt-oss (#23613) Signed-off-by: Wei Wei <wwei6@meta.com> --- vllm/model_executor/models/gpt_oss.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index cd93f0ef1e310..9c1c05320cf36 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -174,12 +174,15 @@ class TransformerBlock(torch.nn.Module): def __init__( self, config: GptOssConfig, + cache_config: CacheConfig, quant_config: QuantizationConfig, prefix: 
str = "", ): super().__init__() self.layer_idx = extract_layer_index(prefix) - self.attn = OAIAttention(config, prefix=f"{prefix}.attn") + self.attn = OAIAttention(config, + prefix=f"{prefix}.attn", + cache_config=cache_config) self.mlp = MLPBlock(config, self.layer_idx, quant_config=quant_config, @@ -203,6 +206,7 @@ class GptOssModel(nn.Module): ): super().__init__() self.config = vllm_config.model_config.hf_config + self.cache_config = vllm_config.cache_config self.quant_config = vllm_config.quant_config self.parallel_config = vllm_config.parallel_config self.config.hidden_size = self.config.hidden_size @@ -213,6 +217,7 @@ class GptOssModel(nn.Module): self.layers = torch.nn.ModuleList([ TransformerBlock( self.config, + cache_config=self.cache_config, quant_config=self.quant_config, prefix=maybe_prefix(prefix, f"block.{layer_idx}"), ) for layer_idx in range(self.config.num_hidden_layers) From 786835807b491279af1fc5f565df9c6baedf3827 Mon Sep 17 00:00:00 2001 From: Yiheng Xu <charlesyihengxu@gmail.com> Date: Wed, 27 Aug 2025 10:58:32 +0800 Subject: [PATCH 064/112] [Bugfix]: Qwen3 Coder Tool Parser (#23099) Signed-off-by: Yiheng Xu <charlesyihengxu@gmail.com> Co-authored-by: Aaron Pham <contact@aarnphm.xyz> --- examples/tool_chat_template_qwen3coder.jinja | 117 ++++ tests/tool_use/test_qwen3coder_tool_parser.py | 178 +++++- .../tool_parsers/qwen3coder_tool_parser.py | 519 ++++++++++-------- 3 files changed, 571 insertions(+), 243 deletions(-) create mode 100644 examples/tool_chat_template_qwen3coder.jinja diff --git a/examples/tool_chat_template_qwen3coder.jinja b/examples/tool_chat_template_qwen3coder.jinja new file mode 100644 index 0000000000000..49b0e8d0ee7e6 --- /dev/null +++ b/examples/tool_chat_template_qwen3coder.jinja @@ -0,0 +1,117 @@ +{% macro render_extra_keys(json_dict, handled_keys) %} + {%- if json_dict is mapping %} + {%- for json_key in json_dict if json_key not in handled_keys %} + {%- if json_dict[json_key] is mapping or (json_dict[json_key] is 
sequence and json_dict[json_key] is not string) %} + {{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }} + {%- else %} + {{-'\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }} + {%- endif %} + {%- endfor %} + {%- endif %} +{% endmacro %} + +{%- if messages[0]["role"] == "system" %} + {%- set system_message = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} +{%- endif %} + +{%- if not tools is defined %} + {%- set tools = [] %} +{%- endif %} + +{%- if system_message is defined %} + {{- "<|im_start|>system\n" + system_message }} +{%- else %} + {%- if tools is iterable and tools | length > 0 %} + {{- "<|im_start|>system\nYou are Qwen, a helpful AI assistant that can interact with a computer to solve tasks." }} + {%- endif %} +{%- endif %} +{%- if tools is iterable and tools | length > 0 %} + {{- "\n\n# Tools\n\nYou have access to the following functions:\n\n" }} + {{- "<tools>" }} + {%- for tool in tools %} + {%- if tool.function is defined %} + {%- set tool = tool.function %} + {%- endif %} + {{- "\n<function>\n<name>" ~ tool.name ~ "</name>" }} + {%- if tool.description is defined %} + {{- '\n<description>' ~ (tool.description | trim) ~ '</description>' }} + {%- endif %} + {{- '\n<parameters>' }} + {%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {{- '\n<parameter>' }} + {{- '\n<name>' ~ param_name ~ '</name>' }} + {%- if param_fields.type is defined %} + {{- '\n<type>' ~ (param_fields.type | string) ~ '</type>' }} + {%- endif %} + {%- if param_fields.description is defined %} + {{- '\n<description>' ~ (param_fields.description | trim) ~ '</description>' }} + {%- endif %} + {%- set handled_keys = ['name', 'type', 'description'] %} + {{- 
render_extra_keys(param_fields, handled_keys) }} + {{- '\n</parameter>' }} + {%- endfor %} + {%- endif %} + {% set handled_keys = ['type', 'properties'] %} + {{- render_extra_keys(tool.parameters, handled_keys) }} + {{- '\n</parameters>' }} + {%- set handled_keys = ['type', 'name', 'description', 'parameters'] %} + {{- render_extra_keys(tool, handled_keys) }} + {{- '\n</function>' }} + {%- endfor %} + {{- "\n</tools>" }} + {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }} +{%- endif %} +{%- if system_message is defined %} + {{- '<|im_end|>\n' }} +{%- else %} + {%- if tools is iterable and tools | length > 0 %} + {{- '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in loop_messages %} + {%- if message.role == "assistant" and message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %} + {{- '<|im_start|>' + message.role }} + {%- if message.content is defined and message.content is string and message.content | trim | length > 0 %} + {{- '\n' + message.content | trim + '\n' }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- 
endif %} + {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }} + {%- if tool_call.arguments is defined %} + {%- for args_name, args_value in tool_call.arguments|items %} + {{- '<parameter=' + args_name + '>\n' }} + {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %} + {{- args_value }} + {{- '\n</parameter>\n' }} + {%- endfor %} + {%- endif %} + {{- '</function>\n</tool_call>' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "user" or message.role == "system" or message.role == "assistant" %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "tool" %} + {%- if loop.previtem and loop.previtem.role != "tool" %} + {{- '<|im_start|>user\n' }} + {%- endif %} + {{- '<tool_response>\n' }} + {{- message.content }} + {{- '\n</tool_response>\n' }} + {%- if not loop.last and loop.nextitem.role != "tool" %} + {{- '<|im_end|>\n' }} + {%- elif loop.last %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py index 40c3158e9e683..ccb2acf512caf 100644 --- a/tests/tool_use/test_qwen3coder_tool_parser.py +++ b/tests/tool_use/test_qwen3coder_tool_parser.py @@ -16,7 +16,7 @@ from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import ( from vllm.transformers_utils.detokenizer import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer -MODEL = "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8" +MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8" @pytest.fixture(scope="module") @@ -397,7 +397,9 @@ hello world "no_tools", "single_tool", 
"single_tool_with_content", + "single_tool_multiline_param", "parallel_tools", + "tool_with_typed_params", # Added this test case ], argnames=["model_output", "expected_tool_calls", "expected_content"], argvalues=[ @@ -422,7 +424,7 @@ fahrenheit "state": "TX", "unit": "fahrenheit" }))) - ], ""), + ], None), ('''Sure! Let me check the weather for you.<tool_call> <function=get_current_weather> <parameter=city> @@ -445,6 +447,30 @@ fahrenheit }))) ], "Sure! Let me check the weather for you."), ('''<tool_call> +<function=calculate_area> +<parameter=shape> +rectangle +</parameter> +<parameter=dimensions> +{"width": 10, + "height": 20} +</parameter> +<parameter=precision> +2 +</parameter> +</function> +</tool_call>''', [ + ToolCall(function=FunctionCall(name="calculate_area", + arguments=json.dumps({ + "shape": "rectangle", + "dimensions": { + "width": 10, + "height": 20 + }, + "precision": 2 + }))) + ], None), + ('''<tool_call> <function=get_current_weather> <parameter=city> Dallas @@ -484,13 +510,36 @@ celsius "state": "FL", "unit": "celsius" }))) - ], ""), + ], None), + # Added tool_with_typed_params test case + ('''Let me calculate that area for you.<tool_call> +<function=calculate_area> +<parameter=shape> +circle +</parameter> +<parameter=dimensions> +{"radius": 15.5} +</parameter> +<parameter=precision> +3 +</parameter> +</function> +</tool_call>''', [ + ToolCall(function=FunctionCall(name="calculate_area", + arguments=json.dumps({ + "shape": "circle", + "dimensions": { + "radius": 15.5 + }, + "precision": 3 + }))) + ], "Let me calculate that area for you."), ], ) def test_extract_tool_calls_streaming(qwen3_tool_parser, qwen3_tokenizer, sample_tools, model_output, expected_tool_calls, expected_content): - """Test incremental streaming behavior""" + """Test incremental streaming behavior including typed parameters""" request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools) @@ -539,7 +588,7 @@ def 
test_extract_tool_calls_streaming(qwen3_tool_parser, qwen3_tokenizer, "arguments"] += tool_call.function.arguments # Verify final content - assert other_content == expected_content + assert other_content == (expected_content or "") # Handle None case # Verify we got all expected tool calls assert len(tool_states) == len(expected_tool_calls) @@ -559,6 +608,125 @@ def test_extract_tool_calls_streaming(qwen3_tool_parser, qwen3_tokenizer, assert actual_args == expected_args +def test_extract_tool_calls_missing_closing_parameter_tag( + qwen3_tool_parser, sample_tools): + """Test handling of missing closing </parameter> tag""" + # Using get_current_weather from sample_tools but with malformed XML + model_output = '''Let me check the weather for you: +<tool_call> +<function=get_current_weather> +<parameter=city> +Dallas +<parameter=state> +TX +</parameter> +<parameter=unit> +fahrenheit +</parameter> +</function> +</tool_call>''' + + request = ChatCompletionRequest(model=MODEL, + messages=[], + tools=sample_tools) + extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + model_output, request=request) + + # The parser should handle the malformed XML gracefully + assert extracted_tool_calls.tools_called + assert len(extracted_tool_calls.tool_calls) == 1 + + # Verify the function name is correct + assert extracted_tool_calls.tool_calls[ + 0].function.name == "get_current_weather" + + # Verify the arguments are parsed despite the missing closing tag + args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) + assert "city" in args + assert args["city"] == "Dallas" + assert args["state"] == "TX" + assert args["unit"] == "fahrenheit" + + # Check that content before the tool call is preserved + assert "Let me check the weather for you:" in extracted_tool_calls.content + + +def test_extract_tool_calls_streaming_missing_closing_tag( + qwen3_tool_parser, qwen3_tokenizer, sample_tools): + """Test streaming with missing closing </parameter> tag""" + # Using 
get_current_weather from sample_tools but with malformed XML + model_output = '''Let me check the weather for you: +<tool_call> +<function=get_current_weather> +<parameter=city> +Dallas +<parameter=state> +TX +</parameter> +<parameter=unit> +fahrenheit +</parameter> +</function> +</tool_call>''' + + request = ChatCompletionRequest(model=MODEL, + messages=[], + tools=sample_tools) + + other_content = '' + tool_states = {} + + for delta_message in stream_delta_message_generator( + qwen3_tool_parser, qwen3_tokenizer, model_output, request): + + if delta_message.content: + other_content += delta_message.content + + if delta_message.tool_calls: + for tool_call in delta_message.tool_calls: + idx = tool_call.index + + if idx not in tool_states: + tool_states[idx] = { + "id": None, + "name": None, + "arguments": "", + "type": None + } + + if tool_call.id: + tool_states[idx]["id"] = tool_call.id + + if tool_call.type: + assert tool_call.type == "function" + tool_states[idx]["type"] = tool_call.type + + if tool_call.function: + if tool_call.function.name: + tool_states[idx]["name"] = tool_call.function.name + + if tool_call.function.arguments is not None: + tool_states[idx][ + "arguments"] += tool_call.function.arguments + + # Verify content was streamed + assert "Let me check the weather for you:" in other_content + + # Verify we got the tool call + assert len(tool_states) == 1 + state = tool_states[0] + assert state["id"] is not None + assert state["type"] == "function" + assert state["name"] == "get_current_weather" + + # Verify arguments were parsed correctly despite missing closing tag + assert state["arguments"] is not None + args = json.loads(state["arguments"]) + assert args["city"] == "Dallas" + assert args["state"] == "TX" + assert args["unit"] == "fahrenheit" + + def test_extract_tool_calls_streaming_incremental(qwen3_tool_parser, qwen3_tokenizer, sample_tools): diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py 
b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py index 2501d6739e8f6..955813ddd3408 100644 --- a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +import ast import json import uuid from collections.abc import Sequence @@ -22,7 +22,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module(["qwen3_coder"]) +@ToolParserManager.register_module("qwen3_coder") class Qwen3CoderToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): @@ -30,6 +30,8 @@ class Qwen3CoderToolParser(ToolParser): self.current_tool_name_sent: bool = False self.prev_tool_call_arr: list[dict] = [] + # Override base class type - we use string IDs for tool calls + self.current_tool_id: Optional[str] = None # type: ignore self.streamed_args_for_tool: list[str] = [] # Sentinel tokens for streaming mode @@ -42,20 +44,6 @@ class Qwen3CoderToolParser(ToolParser): self.is_tool_call_started: bool = False self.failed_count: int = 0 - # Streaming state variables - self.current_tool_index: int = 0 - self.header_sent: bool = False - self.current_tool_string_id: Optional[str] = None - self.current_function_name: Optional[str] = None - self.current_param_name: Optional[str] = None - self.current_param_value: str = "" - self.param_count: int = 0 - self.in_param: bool = False - self.in_function: bool = False - self.accumulated_text: str = "" - self.json_started: bool = False - self.json_closed: bool = False - # Enhanced streaming state - reset for each new message self._reset_streaming_state() @@ -67,7 +55,8 @@ class Qwen3CoderToolParser(ToolParser): self.tool_call_function_regex = re.compile( r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL) self.tool_call_parameter_regex = re.compile( - 
r"<parameter=(.*?)</parameter>|<parameter=(.*?)$", re.DOTALL) + r"<parameter=(.*?)(?:</parameter>|(?=<parameter=)|(?=</function>)|$)", + re.DOTALL) if not self.model_tokenizer: raise ValueError( @@ -84,8 +73,8 @@ class Qwen3CoderToolParser(ToolParser): "Qwen3 XML Tool parser could not locate tool call start/end " "tokens in the tokenizer!") - logger.debug("vLLM Successfully import tool parser %s !", - self.__class__.__name__) + logger.info("vLLM Successfully import tool parser %s !", + self.__class__.__name__) def _generate_tool_call_id(self) -> str: """Generate a unique tool call ID.""" @@ -96,7 +85,7 @@ class Qwen3CoderToolParser(ToolParser): self.current_tool_index = 0 self.is_tool_call_started = False self.header_sent = False - self.current_tool_string_id = None + self.current_tool_id = None self.current_function_name = None self.current_param_name = None self.current_param_value = "" @@ -106,122 +95,122 @@ class Qwen3CoderToolParser(ToolParser): self.accumulated_text = "" self.json_started = False self.json_closed = False + # Store accumulated parameters for type conversion + self.accumulated_params = {} + self.streaming_request = None + + def _get_arguments_config( + self, func_name: str, + tools: Optional[list[ChatCompletionToolsParam]]) -> dict: + """Extract argument configuration for a function.""" + if tools is None: + return {} + for config in tools: + if not hasattr(config, "type") or not (hasattr( + config, "function") and hasattr(config.function, "name")): + continue + if config.type == "function" and config.function.name == func_name: + if not hasattr(config.function, "parameters"): + return {} + params = config.function.parameters + if isinstance(params, dict) and "properties" in params: + return params["properties"] + elif isinstance(params, dict): + return params + else: + return {} + logger.warning("Tool '%s' is not defined in the tools list.", + func_name) + return {} + + def _convert_param_value(self, param_value: str, param_name: str, + 
param_config: dict, func_name: str) -> Any: + """Convert parameter value based on its type in the schema.""" + # Handle null value for any type + if param_value.lower() == "null": + return None + + if param_name not in param_config: + if param_config != {}: + logger.warning( + "Parsed parameter '%s' is not defined in the tool " + "parameters for tool '%s', directly returning the " + "string value.", param_name, func_name) + return param_value + + if isinstance(param_config[param_name], + dict) and "type" in param_config[param_name]: + param_type = str(param_config[param_name]["type"]).strip().lower() + else: + param_type = "string" + if param_type in ["string", "str", "text", "varchar", "char", "enum"]: + return param_value + elif param_type.startswith("int") or param_type.startswith( + "uint") or param_type.startswith( + "long") or param_type.startswith( + "short") or param_type.startswith("unsigned"): + try: + return int(param_value) + except (ValueError, TypeError): + logger.warning( + "Parsed value '%s' of parameter '%s' is not an " + "integer in tool '%s', degenerating to string.", + param_value, param_name, func_name) + return param_value + elif param_type.startswith("num") or param_type.startswith("float"): + try: + float_param_value = float(param_value) + return float_param_value if float_param_value - int( + float_param_value) != 0 else int(float_param_value) + except (ValueError, TypeError): + logger.warning( + "Parsed value '%s' of parameter '%s' is not a float " + "in tool '%s', degenerating to string.", param_value, + param_name, func_name) + return param_value + elif param_type in ["boolean", "bool", "binary"]: + param_value = param_value.lower() + if param_value not in ["true", "false"]: + logger.warning( + "Parsed value '%s' of parameter '%s' is not a boolean " + "(`true` or `false`) in tool '%s', degenerating to " + "false.", param_value, param_name, func_name) + return param_value == "true" + else: + if param_type in ["object", "array", "arr" + ] 
or param_type.startswith( + "dict") or param_type.startswith("list"): + try: + param_value = json.loads(param_value) + return param_value + except (json.JSONDecodeError, TypeError, ValueError): + logger.warning( + "Parsed value '%s' of parameter '%s' cannot be " + "parsed with json.loads in tool '%s', will try " + "other methods to parse it.", param_value, param_name, + func_name) + try: + param_value = ast.literal_eval(param_value) # safer + except (ValueError, SyntaxError, TypeError): + logger.warning( + "Parsed value '%s' of parameter '%s' cannot be " + "converted via Python `ast.literal_eval()` in tool " + "'%s', degenerating to string.", param_value, param_name, + func_name) + return param_value def _parse_xml_function_call( self, function_call_str: str, tools: Optional[list[ChatCompletionToolsParam]] ) -> Optional[ToolCall]: - def get_arguments_config(func_name: str) -> dict: - if tools is None: - return {} - for config in tools: - if not hasattr(config, "type") or not ( - hasattr(config, "function") - and hasattr(config.function, "name")): - continue - if (config.type == "function" - and config.function.name == func_name): - if not hasattr(config.function, "parameters"): - return {} - params = config.function.parameters - if isinstance(params, dict) and "properties" in params: - return params["properties"] - elif isinstance(params, dict): - return params - else: - return {} - logger.warning("Tool '%s' is not defined in the tools list.", - func_name) - return {} - - def convert_param_value(param_value: str, param_name: str, - param_config: dict, func_name: str) -> Any: - # Handle null value for any type - if param_value.lower() == "null": - return None - - converted_value: Any - - if param_name not in param_config: - if param_config != {}: - logger.warning( - "Parsed parameter '%s' is not defined in the tool " - "parameters for tool '%s', directly returning the " - "string value.", param_name, func_name) - return param_value - - if 
(isinstance(param_config[param_name], dict) - and "type" in param_config[param_name]): - param_type = str( - param_config[param_name]["type"]).strip().lower() - else: - param_type = "string" - if param_type in [ - "string", "str", "text", "varchar", "char", "enum" - ]: - return param_value - elif (param_type.startswith("int") or param_type.startswith("uint") - or param_type.startswith("long") - or param_type.startswith("short") - or param_type.startswith("unsigned")): - try: - converted_value = int(param_value) - return converted_value - except ValueError: - logger.warning( - "Parsed value '%s' of parameter '%s' is not an " - "integer in tool '%s', degenerating to string.", - param_value, param_name, func_name) - return param_value - elif (param_type.startswith("num") - or param_type.startswith("float")): - try: - float_param_value = float(param_value) - converted_value = (float_param_value if float_param_value - - int(float_param_value) != 0 else - int(float_param_value)) - return converted_value - except ValueError: - logger.warning( - "Parsed value '%s' of parameter '%s' is not a float " - "in tool '%s', degenerating to string.", param_value, - param_name, func_name) - return param_value - elif param_type in ["boolean", "bool", "binary"]: - param_value = param_value.lower() - if param_value not in ["true", "false"]: - logger.warning( - "Parsed value '%s' of parameter '%s' is not a " - "boolean (`true` of `false`) in tool '%s', " - "degenerating to false.", param_value, param_name, - func_name) - return param_value == "true" - else: - if param_type == "object" or param_type.startswith("dict"): - try: - converted_value = json.loads(param_value) - return converted_value - except json.JSONDecodeError: - logger.warning( - "Parsed value '%s' of parameter '%s' is not a " - "valid JSON object in tool '%s', will try other " - "methods to parse it.", param_value, param_name, - func_name) - logger.warning( - "Parameter '%s' has unknown type '%s'. 
" - "The value will be treated as a string.", param_name, - param_type) - return param_value - # Extract function name end_index = function_call_str.index(">") function_name = function_call_str[:end_index] - param_config = get_arguments_config(function_name) + param_config = self._get_arguments_config(function_name, tools) parameters = function_call_str[end_index + 1:] param_dict = {} - for match in self.tool_call_parameter_regex.findall(parameters): - match_text = match[0] if match[0] else match[1] + for match_text in self.tool_call_parameter_regex.findall(parameters): idx = match_text.index(">") param_name = match_text[:idx] param_value = str(match_text[idx + 1:]) @@ -231,7 +220,7 @@ class Qwen3CoderToolParser(ToolParser): if param_value.endswith("\n"): param_value = param_value[:-1] - param_dict[param_name] = convert_param_value( + param_dict[param_name] = self._convert_param_value( param_value, param_name, param_config, function_name) return ToolCall( type="function", @@ -284,8 +273,7 @@ class Qwen3CoderToolParser(ToolParser): for function_call_str in function_calls ] - # Populate prev_tool_call_arr for serving layer to set - # finish_reason + # Populate prev_tool_call_arr for serving layer to set finish_reason self.prev_tool_call_arr.clear() # Clear previous calls for tool_call in tool_calls: if tool_call: @@ -298,8 +286,8 @@ class Qwen3CoderToolParser(ToolParser): # Extract content before tool calls content_index = model_output.find(self.tool_call_start_token) - content_index = (content_index if content_index >= 0 else - model_output.find(self.tool_call_prefix)) + idx = model_output.find(self.tool_call_prefix) + content_index = content_index if content_index >= 0 else idx content = model_output[:content_index] # .rstrip() return ExtractedToolCallInformation( @@ -324,13 +312,16 @@ class Qwen3CoderToolParser(ToolParser): delta_token_ids: Sequence[int], request: ChatCompletionRequest, ) -> Union[DeltaMessage, None]: - # If no delta text, return None unless it's 
an EOS token after tool - # calls + # Store request for type conversion + if not previous_text: + self._reset_streaming_state() + self.streaming_request = request + + # If no delta text, return None unless it's an EOS token after tools if not delta_text: # Check if this is an EOS token after all tool calls are complete - # We check for tool calls in the text even if is_tool_call_started - # is False because it might have been reset after processing all - # tools + # Check for tool calls in text even if is_tool_call_started + # is False (might have been reset after processing all tools) if (delta_token_ids and self.tool_call_end_token_id not in delta_token_ids): # Count complete tool calls @@ -339,24 +330,19 @@ class Qwen3CoderToolParser(ToolParser): # If we have completed tool calls and populated # prev_tool_call_arr - if (complete_calls > 0 and len(self.prev_tool_call_arr) > 0): + if complete_calls > 0 and len(self.prev_tool_call_arr) > 0: # Check if all tool calls are closed - open_calls = ( - current_text.count(self.tool_call_start_token) - - current_text.count(self.tool_call_end_token)) + open_calls = current_text.count( + self.tool_call_start_token) - current_text.count( + self.tool_call_end_token) if open_calls == 0: - # Return empty delta message to allow finish_reason - # processing + # Return empty delta for finish_reason processing return DeltaMessage(content="") elif not self.is_tool_call_started and current_text: # This is a regular content response that's now complete return DeltaMessage(content="") return None - # Check if this is the first call (reset state if needed) - if not previous_text: - self._reset_streaming_state() - # Update accumulated text self.accumulated_text = current_text @@ -371,11 +357,11 @@ class Qwen3CoderToolParser(ToolParser): self.param_count = 0 self.json_started = False self.json_closed = False + self.accumulated_params = {} # Check if there are more tool calls - tool_starts_count = current_text.count( - 
self.tool_call_start_token) - if self.current_tool_index >= tool_starts_count: + tool_starts = current_text.count(self.tool_call_start_token) + if self.current_tool_index >= tool_starts: # No more tool calls self.is_tool_call_started = False # Continue processing next tool @@ -412,20 +398,20 @@ class Qwen3CoderToolParser(ToolParser): # We're in a tool call, find the current tool call portion # Need to find the correct tool call based on current_tool_index - tool_starts: list[int] = [] + tool_start_positions: list[int] = [] idx = 0 while True: idx = current_text.find(self.tool_call_start_token, idx) if idx == -1: break - tool_starts.append(idx) + tool_start_positions.append(idx) idx += len(self.tool_call_start_token) - if self.current_tool_index >= len(tool_starts): + if self.current_tool_index >= len(tool_start_positions): # No more tool calls to process yet return None - tool_start_idx = tool_starts[self.current_tool_index] + tool_start_idx = tool_start_positions[self.current_tool_index] # Find where this tool call ends (or current position if not ended yet) tool_end_idx = current_text.find(self.tool_call_end_token, tool_start_idx) @@ -438,19 +424,19 @@ class Qwen3CoderToolParser(ToolParser): # Looking for function header if not self.header_sent: if self.tool_call_prefix in tool_text: - func_start = (tool_text.find(self.tool_call_prefix) + - len(self.tool_call_prefix)) + func_start = tool_text.find(self.tool_call_prefix) + len( + self.tool_call_prefix) func_end = tool_text.find(">", func_start) if func_end != -1: # Found complete function name self.current_function_name = tool_text[func_start:func_end] - self.current_tool_string_id = self._generate_tool_call_id() + self.current_tool_id = self._generate_tool_call_id() self.header_sent = True self.in_function = True - # IMPORTANT: Add to prev_tool_call_arr immediately when we - # detect a tool call. This ensures + # IMPORTANT: Add to prev_tool_call_arr immediately when + # we detect a tool call. 
This ensures # finish_reason="tool_calls" even if parsing isn't complete already_added = any( tool.get("name") == self.current_function_name @@ -466,7 +452,7 @@ class Qwen3CoderToolParser(ToolParser): return DeltaMessage(tool_calls=[ DeltaToolCall( index=self.current_tool_index, - id=self.current_tool_string_id, + id=self.current_tool_id, function=DeltaFunctionCall( name=self.current_function_name, arguments=""), type="function", @@ -496,10 +482,11 @@ class Qwen3CoderToolParser(ToolParser): # Close JSON self.json_closed = True - # Extract the complete tool call to update prev_tool_call_arr - # with final arguments. Find the function content - func_start = (tool_text.find(self.tool_call_prefix) + - len(self.tool_call_prefix)) + # Extract complete tool call to update + # prev_tool_call_arr with final arguments + # Find the function content + func_start = tool_text.find(self.tool_call_prefix) + len( + self.tool_call_prefix) func_content_end = tool_text.find(self.function_end_token, func_start) if func_content_end != -1: @@ -507,15 +494,17 @@ class Qwen3CoderToolParser(ToolParser): # Parse to get the complete arguments try: parsed_tool = self._parse_xml_function_call( - func_content, request.tools if request else None) + func_content, self.streaming_request.tools + if self.streaming_request else None) if parsed_tool: - # Update existing entry in prev_tool_call_arr with - # complete arguments + # Update existing entry in + # prev_tool_call_arr with complete args for i, tool in enumerate(self.prev_tool_call_arr): - if (tool.get("name") == - parsed_tool.function.name): - self.prev_tool_call_arr[i]["arguments"] = ( - parsed_tool.function.arguments) + if tool.get( + "name") == parsed_tool.function.name: + args = parsed_tool.function.arguments + self.prev_tool_call_arr[i][ + "arguments"] = args break except Exception: pass # Ignore parsing errors during streaming @@ -530,73 +519,110 @@ class Qwen3CoderToolParser(ToolParser): # Reset state for next tool self.in_function = 
False self.json_closed = True + self.accumulated_params = {} return result # Look for parameters - # Count how many complete parameters we have processed - complete_params = tool_text.count(self.parameter_end_token) + # Find all parameter starts + param_starts = [] + idx = 0 + while True: + idx = tool_text.find(self.parameter_prefix, idx) + if idx == -1: + break + param_starts.append(idx) + idx += len(self.parameter_prefix) # Check if we should start a new parameter - if not self.in_param and self.param_count < complete_params: - # Find the unprocessed parameter - # Count parameter starts - param_starts = [] - idx = 0 - while True: - idx = tool_text.find(self.parameter_prefix, idx) - if idx == -1: - break - param_starts.append(idx) - idx += len(self.parameter_prefix) + if (not self.in_param and self.param_count < len(param_starts) + and len(param_starts) > self.param_count): + # Process the next parameter + param_idx = param_starts[self.param_count] + param_start = param_idx + len(self.parameter_prefix) + remaining = tool_text[param_start:] - if len(param_starts) > self.param_count: - # Process the next parameter - param_idx = param_starts[self.param_count] - param_start = param_idx + len(self.parameter_prefix) - remaining = tool_text[param_start:] + if ">" in remaining: + # We have the complete parameter name + name_end = remaining.find(">") + self.current_param_name = remaining[:name_end] - if ">" in remaining: - # We have the complete parameter name - name_end = remaining.find(">") - self.current_param_name = remaining[:name_end] + # Find the parameter value + value_start = param_start + name_end + 1 + value_text = tool_text[value_start:] + if value_text.startswith("\n"): + value_text = value_text[1:] - # Find the parameter value - value_start = param_start + name_end + 1 - value_text = tool_text[value_start:] - if value_text.startswith("\n"): - value_text = value_text[1:] + # Find where this parameter ends + param_end_idx = 
value_text.find(self.parameter_end_token) + if param_end_idx == -1: + # No closing tag, look for next parameter or + # function end + next_param_idx = value_text.find(self.parameter_prefix) + func_end_idx = value_text.find(self.function_end_token) - # Find where this parameter ends - param_end_idx = value_text.find( - self.parameter_end_token) - if param_end_idx != -1: - # Complete parameter found - param_value = value_text[:param_end_idx] - if param_value.endswith("\n"): - param_value = param_value[:-1] - - # Build complete JSON fragment for this parameter - if self.param_count == 0: - json_fragment = ( - '"' + self.current_param_name + '": "' + - json.dumps(param_value)[1:-1] + '"') + if next_param_idx != -1 and (func_end_idx == -1 + or next_param_idx + < func_end_idx): + param_end_idx = next_param_idx + elif func_end_idx != -1: + param_end_idx = func_end_idx + else: + # Neither found, check if tool call is complete + if self.tool_call_end_token in tool_text: + # Tool call is complete, so parameter + # must be complete too. 
Use all + # remaining text before function end + param_end_idx = len(value_text) else: - json_fragment = ( - ', "' + self.current_param_name + '": "' + - json.dumps(param_value)[1:-1] + '"') + # Still streaming, wait for more content + return None - self.param_count += 1 + if param_end_idx != -1: + # Complete parameter found + param_value = value_text[:param_end_idx] + if param_value.endswith("\n"): + param_value = param_value[:-1] - return DeltaMessage(tool_calls=[ - DeltaToolCall( - index=self.current_tool_index, - function=DeltaFunctionCall( - arguments=json_fragment), - ) - ]) + # Store raw value for later processing + self.accumulated_params[ + self.current_param_name] = param_value - # Continue parameter value + # Get parameter configuration for type conversion + param_config = self._get_arguments_config( + self.current_function_name or "", + self.streaming_request.tools + if self.streaming_request else None) + + # Convert param value to appropriate type + converted_value = self._convert_param_value( + param_value, self.current_param_name, param_config, + self.current_function_name or "") + + # Build JSON fragment based on the converted type + # Use json.dumps to properly serialize the value + serialized_value = json.dumps(converted_value, + ensure_ascii=False) + + if self.param_count == 0: + json_fragment = (f'"{self.current_param_name}": ' + f'{serialized_value}') + else: + json_fragment = (f', "{self.current_param_name}": ' + f'{serialized_value}') + + self.param_count += 1 + + return DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall( + arguments=json_fragment), + ) + ]) + + # Continue parameter value - Not used in the current implementation + # since we process complete parameters above if self.in_param: if self.parameter_end_token in delta_text: # End of parameter @@ -608,25 +634,42 @@ class Qwen3CoderToolParser(ToolParser): gt_idx = value_chunk.find(">") value_chunk = value_chunk[gt_idx + 1:] - if 
(not self.current_param_value - and value_chunk.startswith("\n")): + if not self.current_param_value and value_chunk.startswith( + "\n"): value_chunk = value_chunk[1:] - # Calculate incremental JSON + # Store complete value full_value = self.current_param_value + value_chunk - prev_escaped = (json.dumps(self.current_param_value)[1:-1] - if self.current_param_value else "") - full_escaped = json.dumps(full_value)[1:-1] - delta_escaped = full_escaped[len(prev_escaped):] + self.accumulated_params[ + self.current_param_name] = full_value + # Get parameter configuration for type conversion + param_config = self._get_arguments_config( + self.current_function_name or "", + self.streaming_request.tools + if self.streaming_request else None) + + # Convert the parameter value to the appropriate type + converted_value = self._convert_param_value( + full_value, self.current_param_name or "", + param_config, self.current_function_name or "") + + # Serialize the converted value + serialized_value = json.dumps(converted_value, + ensure_ascii=False) + + # Since we've been streaming the quoted version, + # we need to close it properly + # This is complex - for now just complete the value self.in_param = False self.current_param_value = "" + # Just close the current parameter string return DeltaMessage(tool_calls=[ DeltaToolCall( index=self.current_tool_index, function=DeltaFunctionCall( - arguments=delta_escaped + '"'), + arguments='"'), # Close the string quote ) ]) else: @@ -638,18 +681,18 @@ class Qwen3CoderToolParser(ToolParser): gt_idx = value_chunk.find(">") value_chunk = value_chunk[gt_idx + 1:] - if (not self.current_param_value - and value_chunk.startswith("\n")): + if not self.current_param_value and value_chunk.startswith( + "\n"): value_chunk = value_chunk[1:] if value_chunk: # Stream the escaped delta - prev_escaped = (json.dumps( - self.current_param_value)[1:-1] - if self.current_param_value else "") + prev_escaped = json.dumps( + self.current_param_value, 
ensure_ascii=False + )[1:-1] if self.current_param_value else "" self.current_param_value += value_chunk - full_escaped = json.dumps( - self.current_param_value)[1:-1] + full_escaped = json.dumps(self.current_param_value, + ensure_ascii=False)[1:-1] delta_escaped = full_escaped[len(prev_escaped):] if delta_escaped: @@ -661,4 +704,4 @@ class Qwen3CoderToolParser(ToolParser): ) ]) - return None + return None \ No newline at end of file From c905684cfeaee3b2be2c736eee473b2c6ae7f7bf Mon Sep 17 00:00:00 2001 From: Chenheli Hua <huachenheli@outlook.com> Date: Tue, 26 Aug 2025 20:05:34 -0700 Subject: [PATCH 065/112] [Core] Asynchronous h2d in merge_multimodal_embeddings via pinned memory. (#23686) Signed-off-by: Chenheli Hua <huachenheli@outlook.com> Co-authored-by: Roger Wang <hey@rogerw.io> --- vllm/model_executor/models/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 6c27fedc61b17..11e098f1d7bdb 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -508,7 +508,9 @@ def merge_multimodal_embeddings( """ if isinstance(placeholder_token_id, list): placeholder_token_id = torch.tensor(placeholder_token_id, - device=input_ids.device) + pin_memory=True).to( + device=input_ids.device, + non_blocking=True) return _merge_multimodal_embeddings( inputs_embeds, torch.isin(input_ids, placeholder_token_id), From 644d57d53191b94d9e50a4765891c498790d924b Mon Sep 17 00:00:00 2001 From: CSWYF3634076 <58356743+CSWYF3634076@users.noreply.github.com> Date: Wed, 27 Aug 2025 12:02:55 +0800 Subject: [PATCH 066/112] [Model] Add Ernie4.5 VL Model Support (#22514) Signed-off-by: wangyafeng <wangyafeng@baidu.com> --- docs/models/supported_models.md | 1 + examples/offline_inference/vision_language.py | 32 + requirements/test.in | 1 + requirements/test.txt | 3 + .../multimodal/processing/test_common.py | 1 + tests/models/registry.py | 2 + 
.../rotary_embedding/ernie45_vl_rope.py | 72 + .../layers/rotary_embedding/mrope.py | 123 ++ vllm/model_executor/models/ernie45_vl.py | 1504 +++++++++++++++++ vllm/model_executor/models/ernie45_vl_moe.py | 723 ++++++++ vllm/model_executor/models/registry.py | 1 + 11 files changed, 2463 insertions(+) create mode 100644 vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py create mode 100644 vllm/model_executor/models/ernie45_vl.py create mode 100644 vllm/model_executor/models/ernie45_vl_moe.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 74f3a9d1cdb56..19ce8c06724f4 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -616,6 +616,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ | | `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ | | `DonutForConditionalGeneration`<sup>^</sup> | Donut | T + I | `ByteDance/Dolphin`, `naver-clova-ix/donut-base-finetuned-docvqa`, etc. | | | | +| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | ✅︎ | | `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | | | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ | | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. 
| ✅︎ | ✅︎ | ⚠️ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 8d97ba2668263..4e879666f61d7 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -173,6 +173,37 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: ) +# Ernie4.5-VL +def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData: + model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={modality: 1}, + trust_remote_code=True, + ) + + if modality == "image": + placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>" + elif modality == "video": + placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>" + + prompts = [ + ( + f"<|begin_of_sentence|>User: {question}{placeholder}\n" + "Assistant: <think></think>" + ) + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # Florence2 def run_florence2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1602,6 +1633,7 @@ model_example_map = { "chameleon": run_chameleon, "command_a_vision": run_command_a_vision, "deepseek_vl_v2": run_deepseek_vl2, + "ernie45_vl": run_ernie45_vl, "florence2": run_florence2, "fuyu": run_fuyu, "gemma3": run_gemma3, diff --git a/requirements/test.in b/requirements/test.in index 098a9242bc3af..92c577c501632 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -54,3 +54,4 @@ runai-model-streamer-s3==0.11.0 fastsafetensors>=0.1.10 pydantic>=2.10 # 2.9 leads to error on python 3.10 terratorch==1.1rc2 # required for PrithviMAE test +decord==0.6.0 diff --git a/requirements/test.txt b/requirements/test.txt index 8b872752d875c..0c27c9bb67e82 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -156,6 +156,8 @@ 
datasets==3.0.2 # mteb decorator==5.1.1 # via librosa +decord==0.6.0 + # via -r requirements/test.in dill==0.3.8 # via # datasets @@ -493,6 +495,7 @@ numpy==1.26.4 # contourpy # cupy-cuda12x # datasets + # decord # einx # encodec # evaluate diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 74ca10d32609a..6361cb9b5586a 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -272,6 +272,7 @@ def _test_processing_correctness_one( "CohereLabs/command-a-vision-07-2025", "deepseek-ai/deepseek-vl2-tiny", "naver-clova-ix/donut-base-finetuned-docvqa", + "baidu/ERNIE-4.5-VL-28B-A3B-PT", "microsoft/Florence-2-base", "adept/fuyu-8b", "google/gemma-3-4b-it", diff --git a/tests/models/registry.py b/tests/models/registry.py index 20c7c3af67764..f2c09d3e8452a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -396,6 +396,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { transformers_version_reason="HF model is not compatible.", # noqa: E501 hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501 "Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), + "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo("baidu/ERNIE-4.5-VL-28B-A3B-PT", # noqa: E501 + trust_remote_code=True), "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"), "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it", # noqa: E501 diff --git a/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py b/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py new file mode 100644 index 0000000000000..05322e56f2620 --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing 
import Optional + +import torch + +from .common import apply_rotary_emb_dispatch +from .mrope import MRotaryEmbedding + + +class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding): + """3D rotary positional embedding. 3D is t:time h:height w:width""" + + def forward( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + assert positions.ndim == 1 or positions.ndim == 2 + assert key is not None + + num_tokens = positions.shape[-1] + cos_sin = self.cos_sin_cache[positions] + cos, sin = cos_sin.chunk(2, dim=-1) + if positions.ndim == 2: + assert self.mrope_section + + section_h = self.mrope_section[0] # 22 + section_w = self.mrope_section[1] # 22 + section_t = self.mrope_section[2] # 20 + assert section_h == section_w + # Split according to [h w h w h w h w... t t t...] + section_cos_t = cos[..., -section_t:] + section_cos_h = cos[..., :section_h + section_w:2] + section_cos_w = cos[..., 1:section_h + section_w:2] + + cos_t, cos_h, cos_w = section_cos_t[0], section_cos_h[ + 1], section_cos_w[2] + cos_hw = torch.stack([cos_h, cos_w], + dim=-1).reshape(cos_h.shape[:-1] + + (cos_h.shape[-1] * 2, )) + cos = torch.cat([cos_hw, cos_t], dim=-1) + + section_sin_t = sin[..., -section_t:] + section_sin_h = sin[..., :section_h + section_w:2] + section_sin_w = sin[..., 1:section_h + section_w:2] + + sin_t, sin_h, sin_w = section_sin_t[0], section_sin_h[ + 1], section_sin_w[2] + sin_hw = torch.stack([sin_h, sin_w], + dim=-1).reshape(sin_h.shape[:-1] + + (sin_h.shape[-1] * 2, )) + sin = torch.cat([sin_hw, sin_t], dim=-1) + + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, + self.is_neox_style) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + key_shape = key.shape + key = 
key.view(num_tokens, -1, self.head_size) + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, + self.is_neox_style) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index a091cfb743291..e374aa9bebf9e 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -393,6 +393,15 @@ class MRotaryEmbedding(RotaryEmbedding): context_len=context_len, seq_len=seq_len, ) + elif hf_config.model_type in ["ernie4_5_moe_vl", "ernie4_5_vl"]: + return cls._ernie_get_input_positions_tensor( + input_tokens=input_tokens, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + context_len=context_len, + seq_len=seq_len, + ) else: return cls._vl_get_input_positions_tensor( input_tokens=input_tokens, @@ -513,6 +522,120 @@ class MRotaryEmbedding(RotaryEmbedding): len(input_tokens)).item() return llm_positions, mrope_position_delta + @classmethod + def _ernie_get_input_positions_tensor( + cls, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Union[list[list[int]], torch.Tensor], + video_grid_thw: Union[list[list[int]], torch.Tensor], + context_len: int = 0, + seq_len: Optional[int] = None, + ) -> tuple[torch.Tensor, int]: + """Get mrope input positions and delta value for Ernie VL.""" + + image_token_id = hf_config.im_patch_id + video_start_token_id = hf_config.video_start_token_id + video_end_token_id = hf_config.video_end_token_id + spatial_conv_size = hf_config.spatial_conv_size + temporal_conv_size = hf_config.temporal_conv_size + llm_pos_ids_list: list = [] + + if not (image_grid_thw is None and video_grid_thw is None): + if isinstance(image_grid_thw, torch.Tensor): + image_grid_thw = image_grid_thw.tolist() + + 
input_token_type: list[str] = [] + video_check_flg = False + for token in input_tokens: + if token == video_start_token_id: + video_check_flg = True + elif token == video_end_token_id: + video_check_flg = False + + if (token == image_token_id) and (video_check_flg is False): + input_token_type.append("image") + elif (token == image_token_id) and (video_check_flg is True): + input_token_type.append("video") + else: + input_token_type.append("text") + + input_type_group: list[tuple[str, int, int]] = [] + for key, group_iter in itertools.groupby( + enumerate(input_token_type), lambda x: x[1]): + group_list = list(group_iter) + start_index = group_list[0][0] + end_index = group_list[-1][0] + 1 + input_type_group.append((key, start_index, end_index)) + + video_frame_num = 1 + mm_data_idx = 0 + for modality_type, start_idx, end_idx in input_type_group: + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + if modality_type == "image": + t, h, w = ( + image_grid_thw[mm_data_idx][0], + image_grid_thw[mm_data_idx][1], + image_grid_thw[mm_data_idx][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = \ + t, h // spatial_conv_size, w // spatial_conv_size + + t_index = torch.arange(llm_grid_t).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( + llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( + llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + st_idx) + mm_data_idx += 1 + + elif modality_type == "video": + t, h, w = ( + video_grid_thw[mm_data_idx][0], + video_grid_thw[mm_data_idx][1], + video_grid_thw[mm_data_idx][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = (t // + temporal_conv_size, + h // + spatial_conv_size, + w // + spatial_conv_size) + + for t_idx in range(llm_grid_t): + t_index = torch.tensor(t_idx).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w).flatten() + h_index 
= torch.arange(llm_grid_h).view( + 1, -1, 1).expand(1, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view( + 1, 1, -1).expand(1, llm_grid_h, -1).flatten() + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + st_idx) + + mm_data_idx += 1 + video_frame_num += 1 + + else: + text_len = end_idx - start_idx + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + + st_idx) + video_frame_num = 1 + + else: + text_len = len(input_tokens) + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1)) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + llm_positions = llm_positions[:, context_len:seq_len] + mrope_position_delta = (llm_positions.max() + 1 - + len(input_tokens)).item() + return llm_positions, mrope_position_delta + @classmethod def _vl_get_input_positions_tensor( cls, diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py new file mode 100644 index 0000000000000..d880fc434e20f --- /dev/null +++ b/vllm/model_executor/models/ernie45_vl.py @@ -0,0 +1,1504 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The Baidu team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Ernie VL model compatible with HuggingFace weights."""
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from functools import partial
+from typing import Any, Callable, Literal, Optional, TypedDict, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from transformers import BatchFeature
+
+from vllm.config import VllmConfig
+from vllm.distributed import parallel_state
+from vllm.distributed import utils as dist_utils
+from vllm.logger import init_logger
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.layers.activation import QuickGELU
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargsItems)
+from vllm.multimodal.parse import ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptUpdate)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.platforms import _Backend, current_platform
+from vllm.sequence import IntermediateTensors
+
+from .ernie45_vl_moe import Ernie4_5_VLMoeForCausalLM
+from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) +from .utils import (AutoWeightsLoader, WeightsMapper, maybe_prefix, + merge_multimodal_embeddings) +from .vision import get_vit_attn_backend + +logger = init_logger(__name__) + +_MAX_FRAMES_PER_VIDEO = 16 + +# === Vision Transformer === # + + +def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor: + if not interleaved: + x1, x2 = x.chunk(2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + else: + x1, x2 = x[..., ::2], x[..., 1::2] + return rearrange(torch.stack((-x2, x1), dim=-1), + "... d two -> ... (d two)", + two=2) + + +def apply_rotary_emb_torch(x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + interleaved: bool = False) -> torch.Tensor: + """ + x: (batch_size, seqlen, nheads, headdim) + cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2) + """ + ro_dim = cos.shape[-1] * 2 + assert ro_dim <= x.shape[-1] + cos = repeat( + cos, + "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)") + sin = repeat( + sin, + "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 
1 (d 2)") + return torch.cat( + [ + x[..., :ro_dim] * cos + + rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:] + ], + dim=-1, + ) + + +def apply_rotary_pos_emb_vision(t: torch.Tensor, + freqs: torch.Tensor) -> torch.Tensor: + t_ = t.float() + cos = freqs.cos() + sin = freqs.sin() + apply_rotary_emb = apply_rotary_emb_torch + if current_platform.is_cuda(): + from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb + output = apply_rotary_emb(t_, cos, sin).type_as(t) + return output + + +def all_gather_interleave(local_tensor, hidden_size: int, tp_size: int): + """All-gather the input tensor interleavely across model parallel group.""" + import torch.distributed as dist + gathered_tensors = [torch.zeros_like(local_tensor) for _ in range(tp_size)] + dist.all_gather(gathered_tensors, + local_tensor, + group=parallel_state.get_tp_group().device_group) + + gathered_tensors_split = [ + torch.split(tensor, hidden_size // tp_size, -1) + for tensor in gathered_tensors + ] + ordered_tensors = [ + tensor for pair in zip(*gathered_tensors_split) for tensor in pair + ] + result_tensor = torch.cat(ordered_tensors, dim=-1) + return result_tensor + + +class Ernie4_5_VisionAttention(nn.Module): + """VisionAttention using VLLM framework APIs""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + projection_size: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + # Per attention head and per partition values. 
+ self.tp_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() + self.hidden_size_per_attention_head = dist_utils.divide( + projection_size, num_heads) + self.num_attention_heads_per_partition = dist_utils.divide( + num_heads, self.tp_size) + + self.qkv = QKVParallelLinear( + hidden_size=embed_dim, + head_size=self.hidden_size_per_attention_head, + total_num_heads=num_heads, + total_num_kv_heads=num_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv") + self.proj = RowParallelLinear(input_size=projection_size, + output_size=embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj") + + # Detect attention implementation. + self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + if self.attn_backend not in { + _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS, + _Backend.ROCM_AITER_FA + }: + raise RuntimeError( + f"Ernie45-VL does not support {self.attn_backend} backend now." 
+ ) + self.is_flash_attn_backend = self.attn_backend in { + _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA + } + + def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: + # [s, b, 3 * head * head_dim] + seq_len, bs, _ = qkv.shape + if self.tp_size > 1: + qkv = all_gather_interleave(qkv, self.qkv.hidden_size, + self.tp_size) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] + q, k, v = qkv.chunk(3, dim=2) + + # 3 * [s, b, head * head_dim] + if self.tp_size > 1: + splitter = partial(dist_utils.split_tensor_along_last_dim, + num_partitions=self.tp_size) + q = splitter(q)[self.tp_rank] + k = splitter(k)[self.tp_rank] + v = splitter(v)[self.tp_rank] + + # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] + new_shape = (seq_len, bs, self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + q, k, v = (x.view(*new_shape) for x in (q, k, v)) + return q, k, v + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + max_seqlen: Optional[int] = None, # Only used for Flash Attention + seqlens: Optional[list[int]] = None, # Only used for xFormers + ) -> torch.Tensor: + # [s, b, c] --> [s, b, head * 3 * head_dim] + x, _ = self.qkv(x) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] + q, k, v = self.split_qkv(x) + batch_size = q.shape[1] + + q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() + for x in (q, k, v)) + if rotary_pos_emb is not None: + q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) + k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + + if self.is_flash_attn_backend: + # from vllm_flash_attn.flash_attn_interface import ( + # flash_attn_varlen_func) + if self.attn_backend == _Backend.ROCM_AITER_FA: + from aiter import flash_attn_varlen_func + else: + from flash_attn import flash_attn_varlen_func + + q, k, v = (rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) + + output = flash_attn_varlen_func(q, + k, + v, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, + dropout_p=0.0, + causal=False) + + context_layer = rearrange(output, + "(b s) ... -> b s ...", + b=batch_size) + elif self.attn_backend == _Backend.TORCH_SDPA: + # Execute attention entry by entry for speed & less VRAM. + outputs = [] + for i in range(1, len(cu_seqlens)): + start_idx = cu_seqlens[i - 1] + end_idx = cu_seqlens[i] + q_i = q[:, start_idx:end_idx] + k_i = k[:, start_idx:end_idx] + v_i = v[:, start_idx:end_idx] + q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d") + for x in [q_i, k_i, v_i]) + output_i = F.scaled_dot_product_attention(q_i, + k_i, + v_i, + dropout_p=0.0) + output_i = rearrange(output_i, "b h s d -> b s h d ") + outputs.append(output_i) + context_layer = torch.cat(outputs, dim=1) + elif self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalMask + + attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, + kv_seqlen=None, + device=q.device) + + context_layer = xops.memory_efficient_attention_forward( + q, k, v, attn_bias=attn_bias, p=0, scale=None) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() + + output, _ = self.proj(context_layer) + return output + + +class Ernie4_5_VisionMLP(nn.Module): + + def __init__( + self, + in_features: int, + hidden_features: int, + act_layer: type[nn.Module] = QuickGELU, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.fc1 = ColumnParallelLinear(in_features, + hidden_features, + quant_config=quant_config, + prefix=f"{prefix}.fc1") + self.act = act_layer() + self.fc2 = RowParallelLinear(hidden_features, + in_features, + quant_config=quant_config, + prefix=f"{prefix}.fc2") + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_parallel, _ = 
self.fc1(x) + x_parallel = self.act(x_parallel) + x, _ = self.fc2(x_parallel) + return x + + +class Ernie4_5_VisionBlock(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float, + act_layer: type[nn.Module] = QuickGELU, + norm_layer: Optional[Callable[[int], nn.Module]] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.norm1 = norm_layer(dim) + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + + self.attn = Ernie4_5_VisionAttention(embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.attn") + + self.mlp = Ernie4_5_VisionMLP(dim, + mlp_hidden_dim, + act_layer=act_layer, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + max_seqlen: Optional[int] = None, # Only used for Flash Attention + seqlens: Optional[list[int]] = None, # Only used for xFormers + ) -> torch.Tensor: + + hidden_states = hidden_states + self.attn( + self.norm1(hidden_states), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) + return hidden_states + + +class Ernie4_5_VisionPatchEmbed(nn.Module): + + def __init__( + self, + patch_size: int = 14, + in_channels: int = 3, + embed_dim: int = 1280, + prefix="", + ) -> None: + + super().__init__() + self.patch_size = patch_size + self.in_channels = in_channels + self.embed_dim = embed_dim + + self.proj = nn.Linear(in_channels * patch_size * patch_size, + embed_dim, + bias=False) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + + target_dtype = self.proj.weight.dtype + hidden_states = hidden_states.to(target_dtype) + hidden_states = 
self.proj(hidden_states) + + return hidden_states + + +class Ernie4_5_VisionRotaryEmbedding(nn.Module): + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + self.inv_freq = 1.0 / theta**( + torch.arange(start=0, end=dim, step=2, dtype=torch.float32) / dim) + + def forward(self, seqlen: int) -> torch.Tensor: + seq = torch.arange(seqlen, + device=self.inv_freq.device, + dtype=self.inv_freq.dtype) + freqs = torch.outer(input=seq, vec2=self.inv_freq) + return freqs + + +class Ernie4_5_VisionTransformer(nn.Module): + + def __init__( + self, + vision_config, + norm_eps: float = 1e-6, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + + super().__init__() + patch_size = vision_config.patch_size + spatial_merge_size = vision_config.spatial_merge_size + in_channels = vision_config.in_channels + hidden_size = vision_config.hidden_size + embed_dim = vision_config.embed_dim + depth = vision_config.depth + num_heads = vision_config.num_heads + mlp_ratio = vision_config.mlp_ratio + + self.spatial_merge_size = spatial_merge_size + self.num_heads = num_heads + self.embed_dim = embed_dim + + self.patch_embed = Ernie4_5_VisionPatchEmbed( + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim, + prefix=f"{prefix}.patch_embed", + ) + + norm_layer = partial(nn.LayerNorm, eps=norm_eps) + head_dim = embed_dim // num_heads + self.rotary_pos_emb = Ernie4_5_VisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList([ + Ernie4_5_VisionBlock(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}") + for layer_idx in range(depth) + ]) + + assert (hidden_size == embed_dim + ), "vit's config.hidden must be equal to config.embed_dim" + self.ln = nn.LayerNorm(hidden_size, eps=1e-6) + + self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + + @property + def dtype(self) -> torch.dtype: 
+ return self.patch_embed.proj.weight.dtype + + @property + def device(self) -> torch.device: + return self.patch_embed.proj.weight.device + + def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + pos_ids.append( + torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def compute_attn_mask_seqlen( + self, cu_seqlens: torch.Tensor + ) -> tuple[Optional[int], Optional[list[int]]]: + max_seqlen, seqlens = None, None + if self.attn_backend == _Backend.FLASH_ATTN: + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + elif self.attn_backend == _Backend.XFORMERS: + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + return max_seqlen, seqlens + + def forward(self, + hidden_states: torch.Tensor, + grid_thw: torch.Tensor, + num_pad=0) -> torch.Tensor: + + hidden_states = self.patch_embed(hidden_states) + + rotary_pos_emb = self.rot_pos_emb(grid_thw) + rotary_pos_emb = rotary_pos_emb.to(hidden_states.device) + + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], + grid_thw[:, 0]).cumsum( + dim=0, dtype=torch.int32) + + if num_pad > 0: + cu_seqlens = F.pad(cu_seqlens, (1, 1), value=0) + cu_seqlens[-1] = cu_seqlens[-2] + num_pad + else: + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + + # add batch 
size + if hidden_states.ndim == 2: + hidden_states = hidden_states.unsqueeze(dim=1) + + # pre-compute seqlens for attn mask to reduce cuMemcpy operations + max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) + + for i, blk in enumerate(self.blocks): + hidden_states = blk( + hidden_states, + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + + final_output = self.ln(hidden_states) + + if final_output.ndim == 3: + final_output = final_output.squeeze(dim=1) + + return final_output + + def load_weights(self, weights) -> set[str]: + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +# === Vision Inputs === # + + +class Ernie4_5_VLImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + pixel_values: torch.Tensor + """Shape: + `(num_patches, num_channels * patch_size * patch_size)` + """ + + grid_thw: torch.Tensor + """Shape: `(num_images, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +Ernie4_5_VLImageInputs = Ernie4_5_VLImagePixelInputs + + +class Ernie4_5_VLVideoPixelInputs(TypedDict): + type: Literal["pixel_values_videos"] + pixel_values_videos: torch.Tensor + """Shape: + `(num_patches, + num_channels * temporal_patch_size * patch_size * patch_size)` + """ + + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + + This should be in `(grid_t, grid_h, grid_w)` format. 
+ """ + + +Ernie4_5_VLVideoInputs = Ernie4_5_VLImagePixelInputs + +# === Vision Processor === # + + +def round_by_factor(number: Union[int, float], factor: int) -> int: + return round(number / factor) * factor + + +def ceil_by_factor(number: Union[int, float], factor: int) -> int: + return math.ceil(number / factor) * factor + + +def floor_by_factor(number: Union[int, float], factor: int) -> int: + return math.floor(number / factor) * factor + + +def smart_resize( + height: int, + width: int, + factor: int = 28, + min_pixels: int = 4 * 28 * 28, + max_pixels: int = 16384 * 28 * 28, +): + MAX_RATIO = 200 + if max(height, width) / min(height, width) > MAX_RATIO: + if height > width: + new_width = max(factor, round_by_factor(width, factor)) + new_height = floor_by_factor(new_width * MAX_RATIO, factor) + else: + new_height = max(factor, round_by_factor(height, factor)) + new_width = floor_by_factor(new_height * MAX_RATIO, factor) + + height = new_height + width = new_width + + h_bar = max(factor, round_by_factor(height, factor)) + w_bar = max(factor, round_by_factor(width, factor)) + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = floor_by_factor(height / beta, factor) + w_bar = floor_by_factor(width / beta, factor) + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = ceil_by_factor(height * beta, factor) + w_bar = ceil_by_factor(width * beta, factor) + + if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels: + raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}") + + return h_bar, w_bar + + +class VariableResolutionResamplerModel(nn.Module): + + def __init__(self, + in_dim, + out_dim, + spatial_conv_size, + temporal_conv_size, + config, + prefix: str = "") -> None: + super().__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.config = config + self.spatial_conv_size = spatial_conv_size + self.temporal_conv_size = temporal_conv_size + 
self.use_temporal_conv = config.use_temporal_conv + + # compress 2d conv(picture) to 1d + self.spatial_dim = (self.in_dim * self.spatial_conv_size * + self.spatial_conv_size) + # compress 3d conv(video) to 1d + self.temporal_dim = (self.in_dim * self.spatial_conv_size * + self.spatial_conv_size * self.temporal_conv_size) + + self.spatial_linear1 = ColumnParallelLinear( + self.spatial_dim, + self.spatial_dim, + bias=True, + gather_output=True, + quant_config=getattr(config, 'quant_config', None), + prefix=f"{prefix}.spatial_linear1", + ) + + self.spatial_gelu = nn.GELU() + + self.spatial_linear2 = ColumnParallelLinear( + self.spatial_dim, + self.spatial_dim, + bias=True, + gather_output=True, + quant_config=getattr(config, 'quant_config', None), + prefix=f"{prefix}.spatial_linear2", + ) + + self.spatial_norm = nn.LayerNorm(self.spatial_dim, eps=1e-6) + + if self.use_temporal_conv: + self.temporal_linear1 = ColumnParallelLinear( + self.temporal_dim, + self.spatial_dim, + bias=True, + gather_output=True, + quant_config=getattr(config, 'quant_config', None), + prefix=f"{prefix}.temporal_linear1", + ) + + self.temporal_gelu = nn.GELU() + + self.temporal_linear2 = ColumnParallelLinear( + self.spatial_dim, + self.spatial_dim, + bias=True, + gather_output=True, + quant_config=getattr(config, 'quant_config', None), + prefix=f"{prefix}.temporal_linear2", + ) + + self.temporal_norm = nn.LayerNorm(self.spatial_dim, eps=1e-6) + + self.mlp = ColumnParallelLinear( + self.spatial_dim, + self.out_dim, + bias=True, + gather_output=True, + quant_config=getattr(config, 'quant_config', None), + prefix=f"{prefix}.mlp", + ) + + self.after_norm = RMSNorm(hidden_size=out_dim, + eps=getattr(config, 'rms_norm_eps', 1e-6)) + + def spatial_conv_reshape(self, x, spatial_conv_size): + S, C = x.shape + x = x.reshape([-1, C * (spatial_conv_size**2)]) + return x + + def forward(self, x, grid_thw): + + def fwd_spatial(x): + x = self.spatial_conv_reshape(x, self.spatial_conv_size) + + x, _ = 
self.spatial_linear1(x) + x = self.spatial_gelu(x) + x, _ = self.spatial_linear2(x) + x = self.spatial_norm(x) + + return x + + def fwd_placeholder(x, grid_thw, to_tensor=False): + + grid_thw_cpu = grid_thw.cpu().numpy() + grid_t, grid_hw = grid_thw_cpu[:, 0], grid_thw_cpu[:, 1:] + grid_hw_after_conv = grid_hw.prod(-1) // (self.spatial_conv_size** + 2) + + tokens_per_img_or_vid = grid_thw_cpu.prod(-1) // ( + self.spatial_conv_size**2) + batch_offset = np.empty(tokens_per_img_or_vid.size, + dtype=tokens_per_img_or_vid.dtype) + batch_offset[0] = 0 + batch_offset[1:] = tokens_per_img_or_vid.cumsum()[:-1] + + slice_offsets = [] + for temporoal_size, spatial_size, b_offset in zip( + grid_t, grid_hw_after_conv, batch_offset): + for temp_offset in range(0, temporoal_size, 2): + slice_offsets.append( + np.arange( + b_offset + (temp_offset) * spatial_size, + b_offset + (temp_offset + 1) * spatial_size, + )) + slice_offsets = torch.tensor(np.concatenate(slice_offsets, + axis=-1)).to(x.device) + + slice_offsets2 = [] + for temporoal_size, spatial_size, b_offset in zip( + grid_t, grid_hw_after_conv, batch_offset): + for temp_offset in range(1 if temporoal_size > 1 else 0, + temporoal_size, 2): + slice_offsets2.append( + np.arange( + b_offset + (temp_offset) * spatial_size, + b_offset + (temp_offset + 1) * spatial_size, + )) + slice_offsets2 = torch.tensor( + np.concatenate(slice_offsets2, axis=-1)).to(x.device) + + x_timestep_1 = torch.index_select(x, dim=0, index=slice_offsets) + x_timestep_2 = torch.index_select(x, dim=0, index=slice_offsets2) + x = torch.concat([x_timestep_1, x_timestep_2], dim=-1) + return x + + def fwd_temporal(x): + x, _ = self.temporal_linear1(x) + x = self.temporal_gelu(x) + x, _ = self.temporal_linear2(x) + x = self.temporal_norm(x) + return x + + def fwd_mlp(x): + x, _ = self.mlp(x) + x = self.after_norm(x) + return x + + x = fwd_spatial(x) + if self.use_temporal_conv: + x = fwd_placeholder(x, grid_thw) + x = fwd_temporal(x) + x = fwd_mlp(x) + return 
x + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class Ernie4_5_VLProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self): + return self.ctx.model_config.hf_config + + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(use_fast=True, **kwargs) + + def get_image_processor(self, **kwargs: object): + return self.get_hf_processor(**kwargs).image_processor + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def _get_vision_info( + self, + *, + image_width: int, + image_height: int, + num_frames: int = 1, + do_resize: bool = True, + image_processor: Optional[Any], + ) -> tuple[ImageSize, int]: + if image_processor is None: + image_processor = self.get_image_processor() + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + + patch_size = vision_config.patch_size + spatial_conv_size = hf_config.spatial_conv_size + temporal_conv_size = hf_config.temporal_conv_size + + if do_resize: + resized_height, resized_width = smart_resize( + height=image_height, + width=image_width, + factor=patch_size * spatial_conv_size, + min_pixels=image_processor.min_pixels, + max_pixels=image_processor.max_pixels, + ) + preprocessed_size = ImageSize(width=resized_width, + height=resized_height) + else: + preprocessed_size = ImageSize(width=image_width, + height=image_height) + + grid_t = max(num_frames // temporal_conv_size, 1) + grid_h = preprocessed_size.height // patch_size + grid_w = preprocessed_size.width // patch_size + + num_patches = grid_t * grid_h * 
grid_w + num_vision_tokens = num_patches // (spatial_conv_size**2) + + return preprocessed_size, num_vision_tokens + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + image_processor: Optional[Any], + ) -> int: + _, num_image_tokens = self._get_vision_info( + image_width=image_width, + image_height=image_height, + image_processor=image_processor, + ) + return num_image_tokens + + def get_num_video_tokens( + self, + *, + image_width: int, + image_height: int, + num_frames: int, + image_processor: Optional[Any], + ) -> int: + _, num_video_tokens = self._get_vision_info( + image_width=image_width, + image_height=image_height, + num_frames=num_frames, + image_processor=image_processor, + ) + return num_video_tokens + + def get_image_size_with_most_features(self) -> ImageSize: + max_image_size, _ = self._get_vision_info( + image_width=9999999, + image_height=9999999, + image_processor=None, + ) + return max_image_size + + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + num_image_tokens = self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + image_processor=None, + ) + return num_image_tokens + + def _get_max_video_frames(self, max_tokens: int) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + num_frames = 0 + + while True: + next_num_frames = num_frames + 1 + next_max_tokens = self.get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=next_num_frames, + image_processor=None, + ) + + if next_max_tokens > max_tokens: + break + + num_frames = next_num_frames + + # If the number of frames is odd, discard one frame. 
+ if num_frames % 2 != 0: + num_frames -= 1 + + return num_frames + + def get_num_frames_with_most_features( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> int: + max_images = mm_counts.get("image", 0) + max_videos = mm_counts.get("video", 0) + + max_image_tokens = self.get_max_image_tokens() * max_images + max_total_frames = self._get_max_video_frames(seq_len - + max_image_tokens) + max_frames_per_video = min(max_total_frames // max(max_videos, 1), + _MAX_FRAMES_PER_VIDEO) + + return max(max_frames_per_video, 2) + + def get_max_video_tokens( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=self.get_num_frames_with_most_features( + seq_len, mm_counts), + image_processor=None, + ) + + +class Ernie4_5VLMultiModalProcessor( + BaseMultiModalProcessor[Ernie4_5_VLProcessingInfo]): + + def _pixel_values_norm( + self, + pixel_values: torch.Tensor, + mm_kwargs: object, + ) -> torch.Tensor: + hf_config = self.info.get_hf_config() + vision_config = hf_config.vision_config + image_processor = self.info.get_image_processor(**mm_kwargs) + image_mean_tensor = torch.tensor(image_processor.image_mean, + dtype=torch.float32).reshape( + [1, 3, 1, 1]) + image_std_tensor = torch.tensor(image_processor.image_std, + dtype=torch.float32).reshape( + [1, 3, 1, 1]) + rescale_factor = torch.tensor(image_processor.rescale_factor, + dtype=torch.float32) + patch_size_squared = vision_config.patch_size**2 + + image_mean_tensor = (image_mean_tensor.squeeze( + [-2, -1]).repeat_interleave(patch_size_squared, -1)) + image_std_tensor = (image_std_tensor.squeeze( + [-2, -1]).repeat_interleave(patch_size_squared, -1)) + + if not image_mean_tensor.is_contiguous(): + image_mean_tensor = image_mean_tensor.contiguous() + if not image_std_tensor.is_contiguous(): + image_std_tensor = 
image_std_tensor.contiguous() + + pixel_values = (rescale_factor * pixel_values.to(torch.float32) - + image_mean_tensor) / image_std_tensor + pixel_values = pixel_values.to(hf_config.torch_dtype) + return pixel_values + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + # when the prompt is not empty but the multimodal data is empty, + # directly invoke the tokenizer. + if "images" not in mm_data and "videos" not in mm_data and prompt != "": + tokenizer = self.info.get_tokenizer() + prompt_ids = tokenizer.encode(prompt) + tokenizer_output = BatchFeature(dict(input_ids=[prompt_ids]), + tensor_type="pt") + return tokenizer_output + + if "images" not in mm_data: + mm_data["images"] = [] + if "videos" not in mm_data: + mm_data["videos"] = [] + processor_output = self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), + dict(text=[prompt], + images=mm_data["images"], + videos=mm_data["videos"]), + dict(**mm_kwargs, **tok_kwargs), + ) + + # Divide the processor_output into two modalities: image and video. 
+ if processor_output is not None: + pixel_values = processor_output['images'] + if pixel_values is not None: + processor_output['images'] = self._pixel_values_norm( + pixel_values, mm_kwargs) + for key in list(processor_output.keys()): + if processor_output[key] is None: + del processor_output[key] + continue + if key == "grid_thw": + grid_thw = processor_output['grid_thw'] + pixel_values_all = processor_output['images'] + # Identify elements where the first + # dimension is greater than 1 and + # treat them as the video modality + mask = grid_thw[:, 0] > 1 + processor_output["video_grid_thw"] = grid_thw[mask] + processor_output["image_grid_thw"] = grid_thw[~mask] + image_patch_num = processor_output["image_grid_thw"].prod( + dim=1).sum() + processor_output[ + 'pixel_values'] = pixel_values_all[:image_patch_num] + processor_output['pixel_values_videos'] = pixel_values_all[ + image_patch_num:] + del processor_output['images'] + + return processor_output + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + before_placeholder = { + "image": "<|image@placeholder|>", + "video": "<|video@placeholder|>" + } + + after_placeholder = { + # image and video have same placeholder + "image": "<|IMAGE_PLACEHOLDER|>", + "video": "<|IMAGE_PLACEHOLDER|>" + } + + merge_length = hf_processor.spatial_conv_size**2 + + def get_replacement_ernie45vl(item_idx: int, modality: str): + out_item = out_mm_kwargs[modality][item_idx] + grid_thw = out_item[f"{modality}_grid_thw"].data + assert isinstance(grid_thw, torch.Tensor) + if modality == "video": + num_tokens = int(grid_thw.prod( + )) // hf_processor.temporal_conv_size // merge_length + else: + num_tokens = int(grid_thw.prod()) // merge_length + return after_placeholder[modality] * num_tokens + + return [ + PromptReplacement( + 
modality=modality, + target=before_placeholder[modality], + replacement=partial(get_replacement_ernie45vl, + modality=modality), + ) for modality in ("image", "video") + ] + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + + image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3))) + image_grid_sizes = image_grid_thw.prod(-1) + + video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3))) + video_grid_sizes = video_grid_thw.prod(-1) + + return dict( + pixel_values=MultiModalFieldConfig.flat_from_sizes( + "image", image_grid_sizes), + image_grid_thw=MultiModalFieldConfig.batched("image"), + pixel_values_videos=MultiModalFieldConfig.flat_from_sizes( + "video", video_grid_sizes), + video_grid_thw=MultiModalFieldConfig.batched("video"), + ) + + +class Ernie4_5_VLDummyInputsBuilder( + BaseDummyInputsBuilder[Ernie4_5_VLProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + prompt = "" + for i in range(num_images): + prompt += (f"Picture {i+1}:" + "<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>") + + for i in range(num_videos): + prompt += (f"Video {i+1}:" + "<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>") + return prompt + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + target_num_frames = \ + self.info.get_num_frames_with_most_features(seq_len, mm_counts) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "video": + self._get_dummy_videos(width=target_width, + height=target_height, + num_frames=target_num_frames, + 
num_videos=num_videos) + } + + +@MULTIMODAL_REGISTRY.register_processor( + Ernie4_5VLMultiModalProcessor, + info=Ernie4_5_VLProcessingInfo, + dummy_inputs=Ernie4_5_VLDummyInputsBuilder) +class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsLoRA, SupportsPP): + + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # To ensure correct weight loading and mapping. + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + # model.resampler_model.-> language_model.model.resampler_model. + # language_model.model.resampler_model. -> resampler_model. + "language_model.model.resampler_model.": "resampler_model.", + }, + # resampler_weight_mappings + orig_to_new_substr={ + "spatial_linear.0.": "spatial_linear1.", + "spatial_linear.2.": "spatial_linear2.", + "spatial_linear.3.": "spatial_norm.", + "temporal_linear.0.": "temporal_linear1.", + "temporal_linear.2.": "temporal_linear2.", + "temporal_linear.3.": "temporal_norm.", + }) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + if modality.startswith("image"): + return "<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>" + if modality.startswith("video"): + return "<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>" + + raise ValueError("Only image or video modality is supported") + + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + + self.vision_model = Ernie4_5_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=quant_config, + prefix=maybe_prefix(prefix, 
"vision_model"), + ) + + self.language_model = Ernie4_5_VLMoeForCausalLM( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model"), + ) + + self.resampler_model = VariableResolutionResamplerModel( + self.config.pixel_hidden_size, + self.config.hidden_size, + self.config.spatial_conv_size, + self.config.temporal_conv_size, + config=self.config, + prefix=maybe_prefix(prefix, "resampler_model")) + + self.visual_token_mask = None + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + """compute logits""" + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def _vision_forward( + self, + pixel_values: torch.Tensor, + grid_thw: torch.Tensor, + ) -> torch.Tensor: + if grid_thw is not None: + grid_thw = grid_thw[grid_thw > 0] + if grid_thw.numel() % 3 != 0: + raise ValueError( + f"grid_thw has {grid_thw.numel()} elements after filtering," + "which is not divisible by 3.") + grid_thw = grid_thw.reshape(-1, 3) + # example: [[1,64,64],[2,80,80]] -> [[1,64,64],[1,80,80],[1,80,80]] + grid_thw = F.pad( + torch.repeat_interleave(grid_thw[:, 1:], grid_thw[:, 0], 0), + [1, 0, 0, 0], + value=1, + ) + image_features = self.vision_model(pixel_values, grid_thw) + return image_features + + def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None: + if getattr(self.config, "im_patch_id", None) is not None: + self.visual_token_mask = ( + input_ids == self.config.im_patch_id).reshape(-1, 1) + else: + self.visual_token_mask = None + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def _validate_and_reshape_mm_tensor(self, mm_input: object, + name: str) -> torch.Tensor: + if not isinstance(mm_input, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of {name}. 
" + f"Got type: {type(mm_input)}") + if isinstance(mm_input, torch.Tensor): + if mm_input.ndim == 2: + return mm_input + if mm_input.ndim != 3: + raise ValueError(f"{name} should be 2D or batched 3D tensor. " + f"Got ndim: {mm_input.ndim} " + f"(shape={mm_input.shape})") + return torch.concat(list(mm_input)) + else: + return torch.concat(mm_input) + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Ernie4_5_VLImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_grid_thw = kwargs.pop("image_grid_thw", None) + + if pixel_values is None: + return None + + if pixel_values is not None: + pixel_values = self._validate_and_reshape_mm_tensor( + pixel_values, "image pixel values") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of image pixel values. " + f"Got type: {type(pixel_values)}") + + return Ernie4_5_VLImagePixelInputs(type="pixel_values", + pixel_values=pixel_values, + image_grid_thw=image_grid_thw) + + def _parse_and_validate_video_input( + self, **kwargs: object) -> Optional[Ernie4_5_VLVideoInputs]: + pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_grid_thw = kwargs.pop("video_grid_thw", None) + + if pixel_values_videos is None: + return None + + if pixel_values_videos is not None: + pixel_values_videos = self._validate_and_reshape_mm_tensor( + pixel_values_videos, "video pixel values") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + return Ernie4_5_VLVideoPixelInputs( + type="pixel_values_videos", + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + ) + + def _process_image_input( + self, + image_input: Ernie4_5_VLImageInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + + pixel_values = image_input["pixel_values"].type( + 
self.vision_model.dtype) + image_features = self._vision_forward(pixel_values=pixel_values, + grid_thw=grid_thw) + image_embeds = self.resampler_model(image_features, grid_thw) + + merge_size = self.vision_model.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + + return image_embeds.split(sizes.tolist()) + + def _process_video_input( + self, + video_input: Ernie4_5_VLVideoInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 + + pixel_values_videos = video_input["pixel_values_videos"].type( + self.vision_model.dtype) + video_features = self._vision_forward(pixel_values=pixel_values_videos, + grid_thw=grid_thw) + video_embeds = self.resampler_model(video_features, grid_thw) + + merge_size = self.vision_model.spatial_merge_size + sizes = (grid_thw.prod(-1) // + self.config.temporal_conv_size) // merge_size // merge_size + + return video_embeds.split(sizes.tolist()) + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. + for input_key in kwargs: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: + modalities["images"] = self._parse_and_validate_image_input( + **kwargs) + if input_key in ("pixel_values_videos", + "video_embeds") and "videos" not in modalities: + modalities["videos"] = self._parse_and_validate_video_input( + **kwargs) + + return modalities + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: + return None + + # The result multimodal_embeddings is tuple of tensors, with each + # tensor correspoending to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] 
= () + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. + for modality in modalities: + if modality == "images": + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings += vision_embeddings + if modality == "videos": + video_input = modalities["videos"] + video_embeddings = self._process_video_input(video_input) + multimodal_embeddings += video_embeddings + + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + if multimodal_embeddings is None: + return inputs_embeds + + self._set_visual_token_mask(input_ids) + inputs_embeds = merge_multimodal_embeddings(input_ids, inputs_embeds, + multimodal_embeddings, + [self.config.im_patch_id]) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs, + ): + + forward_kwargs = { + "input_ids": input_ids, + "positions": positions, + "intermediate_tensors": intermediate_tensors, + "inputs_embeds": inputs_embeds, + } + + if self.visual_token_mask is not None: + + if self.visual_token_mask.shape[0] != inputs_embeds.shape[0]: + padding_len = inputs_embeds.shape[ + 0] - self.visual_token_mask.shape[0] + # right pad False + pad = torch.zeros( + (padding_len, self.visual_token_mask.shape[1]), + dtype=self.visual_token_mask.dtype, + device=self.visual_token_mask.device) + self.visual_token_mask = torch.cat( + [self.visual_token_mask, pad], dim=0) + + forward_kwargs.update( + {"visual_token_mask": self.visual_token_mask}) + self.visual_token_mask = None + + hidden_states = self.language_model.model( + **forward_kwargs, + **kwargs, + ) + + return 
hidden_states + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py new file mode 100644 index 0000000000000..f56c098435154 --- /dev/null +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -0,0 +1,723 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The Baidu team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Erine VL model compatible with HuggingFace weights.""" +from collections.abc import Iterable +from typing import Any, Optional, Union + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.attention import Attention +# from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding.ernie45_vl_rope import ( + Ernie4_5_VLRotaryEmbedding) +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .ernie45_moe import Ernie4_5_MoeMLP +from .interfaces import SupportsPP +from .utils import (PPMissingLayer, extract_layer_index, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) + +logger = init_logger(__name__) + + +class Ernie4_5_VLMoeMLP(Ernie4_5_MoeMLP): + pass + + +class Ernie4_5_VLMoeAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + head_dim: Optional[int] = None, + rope_theta: float = 500000, + rope_scaling: Optional[dict[str, Any]] = None, + freq_allocation: int = 20, + max_position_embeddings: int = 131072, + rms_norm_eps: float = 1e-05, + 
qkv_bias: bool = False, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + layer_idx = extract_layer_index(prefix) if len(prefix) > 0 else 0 + self.layer_idx = layer_idx + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim or (hidden_size // self.total_num_heads) + + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear(hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj") + + self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj") + + t_rope = freq_allocation + h_rope = (self.head_dim // 2 - freq_allocation) // 2 + w_rope = (self.head_dim // 2 - freq_allocation) // 2 + + self.rotary_emb = Ernie4_5_VLRotaryEmbedding( + head_size=self.head_dim, + rotary_dim=self.head_dim, + max_position_embeddings=max_position_embeddings, + base=rope_theta, + is_neox_style=False, + dtype=torch.get_default_dtype(), 
+ mrope_section=[h_rope, w_rope, t_rope]) + + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn") + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + + qkv, _ = self.qkv_proj(hidden_states) + + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + + # Attention + attn_output = self.attn(q, k, v) + # Output projection + output, _ = self.o_proj(attn_output) + return output + + +class Ernie4_5_VLMoeMoE(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + + layer_idx = extract_layer_index(prefix) + self.layer_idx = layer_idx + self.tp_size = get_tensor_model_parallel_world_size() + self.has_shared_experts = (getattr(config, "moe_num_shared_experts", 0) + > 0) + self.hidden_size = config.hidden_size + + moe_num_experts = config.moe_num_experts + max_moe_num_experts = max(moe_num_experts) + + if self.tp_size > max_moe_num_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {moe_num_experts}.") + + moe_layer_start_index = config.moe_layer_start_index + text_moe_layer_start_index = moe_layer_start_index[0] + vision_moe_layer_start_index = moe_layer_start_index[1] + moe_layer_end_index = config.moe_layer_end_index + moe_layer_end_index = getattr( + config, "moe_layer_end_index", + [config.num_hidden_layers - 1, config.num_hidden_layers - 1]) + text_moe_layer_end_index = moe_layer_end_index[0] + vision_moe_layer_end_index = moe_layer_end_index[1] + + assert config.moe_num_experts[0] == config.moe_num_experts[1] + self.e_score_correction_bias = nn.Parameter( + torch.empty(2, config.moe_num_experts[0])) + + assert text_moe_layer_start_index <= 
text_moe_layer_end_index + + if layer_idx >= text_moe_layer_start_index and \ + layer_idx <= text_moe_layer_end_index: + self.text_experts_gate = ReplicatedLinear( + config.hidden_size, + config.moe_num_experts[0], + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.text_experts_gate") + + self.text_experts = FusedMoE( + num_experts=config.moe_num_experts[0], + top_k=config.moe_k, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size[0], + reduce_results=False, + renormalize=True, + quant_config=quant_config, + e_score_correction_bias=self.e_score_correction_bias[0], + prefix=f"{prefix}.text_experts") + else: + self.text_experts = Ernie4_5_VLMoeMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + use_bias=getattr(config, 'use_bias', False), + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + assert vision_moe_layer_start_index <= vision_moe_layer_end_index + if layer_idx >= vision_moe_layer_start_index and \ + layer_idx <= vision_moe_layer_end_index: + self.vision_experts_gate = ReplicatedLinear( + config.hidden_size, + config.moe_num_experts[1], + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.vision_experts_gate") + + self.vision_experts = FusedMoE( + num_experts=config.moe_num_experts[1], + top_k=config.moe_k, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size[1], + reduce_results=False, + renormalize=True, + quant_config=quant_config, + e_score_correction_bias=self.e_score_correction_bias[1], + prefix=f"{prefix}.vision_experts") + else: + self.vision_experts = Ernie4_5_VLMoeMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + use_bias=getattr(config, 'use_bias', False), + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + if self.has_shared_experts: + intermediate_size = (config.moe_intermediate_size[0] * + 
config.moe_num_shared_experts) + self.shared_experts = Ernie4_5_VLMoeMLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.shared_experts", + reduce_results=self.text_experts. + must_reduce_shared_expert_outputs()) + + def forward( + self, + hidden_states: torch.Tensor, + visual_token_mask: torch.Tensor, + **kwargs: object, + ) -> torch.Tensor: + + orig_shape = hidden_states.shape + hidden_dim = hidden_states.shape[-1] + hidden_states = hidden_states.view(-1, hidden_dim) + + if self.has_shared_experts: + shared_output = self.shared_experts(hidden_states) + + if visual_token_mask is not None and visual_token_mask.any(): + # assert visual_token_mask.shape[0] != hidden_states.shape[0] + visual_token_mask = visual_token_mask.repeat( + 1, self.hidden_size).bool() + text_token_mask = ~visual_token_mask + final_hidden_states = torch.zeros_like(hidden_states) + + text_hidden_states = hidden_states[text_token_mask].reshape( + -1, self.hidden_size) + vision_hidden_states = hidden_states[visual_token_mask].reshape( + -1, self.hidden_size) + + text_router_logits, _ = self.text_experts_gate(text_hidden_states) + final_hidden_states[text_token_mask] = self.text_experts( + hidden_states=text_hidden_states, + router_logits=text_router_logits).flatten() + + vision_router_logits, _ = self.vision_experts_gate( + vision_hidden_states) + final_hidden_states[visual_token_mask] = self.vision_experts( + hidden_states=vision_hidden_states, + router_logits=vision_router_logits).flatten() + else: + # text modal input processing directly + text_router_logits, _ = self.text_experts_gate(hidden_states) + + final_hidden_states = self.text_experts( + hidden_states=hidden_states, router_logits=text_router_logits) + + if self.has_shared_experts and \ + shared_output is not None: + final_hidden_states = final_hidden_states + shared_output + + if self.tp_size > 1: + final_hidden_states = ( + 
self.text_experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states)) + + return final_hidden_states.view(orig_shape) + + +class Ernie4_5_VLMoeDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 500000) + rope_scaling = getattr(config, "rope_scaling", None) + freq_allocation = getattr(config, "freq_allocation", 20) + max_position_embeddings = getattr(config, "max_position_embeddings", + 131072) + + self.self_attn = Ernie4_5_VLMoeAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + head_dim=getattr(config, 'head_dim', None), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + freq_allocation=freq_allocation, + max_position_embeddings=max_position_embeddings, + rms_norm_eps=config.rms_norm_eps, + qkv_bias=getattr(config, 'use_bias', False), + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + layer_idx = extract_layer_index(prefix) + self.layer_idx = layer_idx + + # MoE + moe_layer_start_index = config.moe_layer_start_index + min_moe_layer_start_index = min(moe_layer_start_index) + moe_layer_end_index = getattr( + config, "moe_layer_end_index", + [config.num_hidden_layers - 1, config.num_hidden_layers - 1]) + max_moe_layer_end_index = max(moe_layer_end_index) + assert min_moe_layer_start_index <= max_moe_layer_end_index + moe_num_experts = config.moe_num_experts + max_moe_num_experts = max(moe_num_experts) + moe_layer_interval = getattr(config, "moe_layer_interval", 1) + use_moe = getattr(config, "use_moe", max_moe_num_experts > 0) + + if (use_moe and ((layer_idx + 1) % moe_layer_interval == 0) + and layer_idx >= min_moe_layer_start_index + and layer_idx <= 
max_moe_layer_end_index): + self.mlp = Ernie4_5_VLMoeMoE(config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + else: + self.mlp = Ernie4_5_VLMoeMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + use_bias=getattr(config, 'use_bias', False), + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + visual_token_mask: Optional[torch.Tensor], + **kwargs: object, + ) -> torch.Tensor: + + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + + if isinstance(self.mlp, Ernie4_5_VLMoeMoE): + hidden_states = self.mlp(hidden_states, visual_token_mask, + **kwargs) + else: + hidden_states = self.mlp(hidden_states) + + return hidden_states, residual + + +# Since Ernie VL distinguishes between text experts and vision experts, +# enabling torch.compile will cause errors. 
+# @support_torch_compile( +# dynamic_arg_dims={ +# "input_ids": 0, +# "positions": -1, +# "intermediate_tensors": 0, +# "inputs_embeds": 0, +# "visual_token_mask": 0, +# }) +class Ernie4_5_VLMoeModel(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.config = config + + self.im_patch_id = config.im_patch_id + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens") + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: Ernie4_5_VLMoeDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + prefix=f"{prefix}.layers", + ) + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + visual_token_mask: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert 
intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, residual, + visual_token_mask, **kwargs) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +# only used as text backbone for ernie4.5-vl +class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = Ernie4_5_VLMoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + else: + self.lm_head = PPMissingLayer() + + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = 
self.model(input_ids, positions, intermediate_tensors, + inputs_embeds, **kwargs) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=max(self.config.moe_num_experts)) + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if self.config.tie_word_embeddings and name.endswith( + "lm_head.weight"): + loaded_params.add("lm_head.weight") + continue + # MTP will be supported soon. + if "mtp" in name or \ + "vision_model" in name or \ + "resampler_model" in name: + continue + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + + if (("mlp.experts." in name) and name not in params_dict): + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. 
+ if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Distinguish between vision experts and text experts + if "mlp.experts" in name: + moe_offset = int(name.split(".")[-3]) + vision_expert_start_idx = self.config.moe_num_experts[0] + is_text_expert = \ + moe_offset <= vision_expert_start_idx - 1 + if is_text_expert: + name = name.replace(".experts.", ".text_experts.") + else: + name = name.replace( + f".experts.{moe_offset}", + f".vision_experts.{moe_offset-vision_expert_start_idx}" + ) + + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + + if weight_name not in name: + continue + + # Distinguish between vision experts and text experts + moe_offset = int(name.split(".")[-3]) + is_text_expert = \ + moe_offset <= self.config.moe_num_experts[0] - 1 + + name = name.replace(weight_name, param_name) + if is_text_expert: + name = name.replace(".experts.", ".text_experts.") + else: + name = name.replace(".experts.", ".vision_experts.") + + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + + # Skip loading extra bias for GPTQ models. 
+ if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + param = params_dict[name] + + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id) + break + else: + # Distinguish between vision expert gate + # and text expert gate + if name.endswith("mlp.gate.weight"): + name = name.replace("gate.weight", + "text_experts_gate.weight") + loaded_weight = loaded_weight.T + elif name.endswith("mlp.gate.weight_1"): + name = name.replace("gate.weight_1", + "vision_experts_gate.weight") + loaded_weight = loaded_weight.T + + if "e_score_correction_bias" in name: + name = name.replace(".moe_statics.", ".") + + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + param = params_dict[name] + + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index ebf78771e40a4..c65c58d4a047f 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -206,6 +206,7 @@ _MULTIMODAL_MODELS = { "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 "Cohere2VisionForConditionalGeneration": ("cohere2_vision", "Cohere2VisionForConditionalGeneration"), # noqa: E501 "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"), + "Ernie4_5_VLMoeForConditionalGeneration": ("ernie45_vl", "Ernie4_5_VLMoeForConditionalGeneration"), # noqa: E501 "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), 
"Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501 "Gemma3nForConditionalGeneration": ("gemma3n_mm", "Gemma3nForConditionalGeneration"), # noqa: E501 From 32102644213a6367d10ec3a92ae76fb0004f3a52 Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Tue, 26 Aug 2025 21:58:59 -0700 Subject: [PATCH 067/112] [Frontend] Add --log-error-stack to print stack trace for error response (#22960) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- vllm/entrypoints/openai/api_server.py | 10 ++++++++++ vllm/entrypoints/openai/cli_args.py | 2 ++ vllm/entrypoints/openai/serving_chat.py | 4 +++- vllm/entrypoints/openai/serving_classification.py | 2 ++ vllm/entrypoints/openai/serving_completion.py | 2 ++ vllm/entrypoints/openai/serving_embedding.py | 4 +++- vllm/entrypoints/openai/serving_engine.py | 9 +++++++++ vllm/entrypoints/openai/serving_pooling.py | 4 +++- vllm/entrypoints/openai/serving_responses.py | 2 ++ vllm/entrypoints/openai/serving_score.py | 4 +++- vllm/entrypoints/openai/serving_tokenization.py | 4 +++- vllm/entrypoints/openai/serving_transcription.py | 8 ++++++-- vllm/entrypoints/openai/speech_to_text.py | 4 +++- 13 files changed, 51 insertions(+), 8 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index db02767fdfd71..9a2470649c8d2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1749,6 +1749,7 @@ async def init_app_state( enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, + log_error_stack=args.log_error_stack, ) if "generate" in supported_tasks else None state.openai_serving_chat = OpenAIServingChat( engine_client, @@ -1767,6 +1768,7 @@ async def init_app_state( enable_prompt_tokens_details=args.enable_prompt_tokens_details, 
enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, + log_error_stack=args.log_error_stack, ) if "generate" in supported_tasks else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, @@ -1776,6 +1778,7 @@ async def init_app_state( return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, + log_error_stack=args.log_error_stack, ) if "generate" in supported_tasks else None state.openai_serving_pooling = OpenAIServingPooling( engine_client, @@ -1784,6 +1787,7 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, + log_error_stack=args.log_error_stack, ) if "encode" in supported_tasks else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, @@ -1792,12 +1796,14 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, + log_error_stack=args.log_error_stack, ) if "embed" in supported_tasks else None state.openai_serving_classification = ServingClassification( engine_client, model_config, state.openai_serving_models, request_logger=request_logger, + log_error_stack=args.log_error_stack, ) if "classify" in supported_tasks else None enable_serving_reranking = ("classify" in supported_tasks and getattr( @@ -1807,6 +1813,7 @@ async def init_app_state( model_config, state.openai_serving_models, request_logger=request_logger, + log_error_stack=args.log_error_stack, ) if ("embed" in supported_tasks or enable_serving_reranking) else None state.openai_serving_tokenization = OpenAIServingTokenization( @@ -1816,18 +1823,21 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, 
chat_template_content_format=args.chat_template_content_format, + log_error_stack=args.log_error_stack, ) state.openai_serving_transcription = OpenAIServingTranscription( engine_client, model_config, state.openai_serving_models, request_logger=request_logger, + log_error_stack=args.log_error_stack, ) if "transcription" in supported_tasks else None state.openai_serving_translation = OpenAIServingTranslation( engine_client, model_config, state.openai_serving_models, request_logger=request_logger, + log_error_stack=args.log_error_stack, ) if "transcription" in supported_tasks else None state.enable_server_load_tracking = args.enable_server_load_tracking diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 6e4eff5c80243..d0b5d013eb9e5 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -180,6 +180,8 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT """Maximum number of HTTP headers allowed in a request for h11 parser. Helps mitigate header abuse. 
Default: 256.""" + log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE + """If set to True, log the stack trace of error responses""" @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7e0e627780970..1c0ffdfb91897 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -76,13 +76,15 @@ class OpenAIServingChat(OpenAIServing): enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, enable_log_outputs: bool = False, + log_error_stack: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, - enable_force_include_usage=enable_force_include_usage) + enable_force_include_usage=enable_force_include_usage, + log_error_stack=log_error_stack) self.response_role = response_role self.chat_template = chat_template diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py index 377f7f6847179..1d510d0b60a2d 100644 --- a/vllm/entrypoints/openai/serving_classification.py +++ b/vllm/entrypoints/openai/serving_classification.py @@ -129,12 +129,14 @@ class ServingClassification(ClassificationMixin): models: OpenAIServingModels, *, request_logger: Optional[RequestLogger], + log_error_stack: bool = False, ) -> None: super().__init__( engine_client=engine_client, model_config=model_config, models=models, request_logger=request_logger, + log_error_stack=log_error_stack, ) async def create_classify( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index a0ce654094039..b81fd63ece7a4 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -59,6 +59,7 @@ class 
OpenAIServingCompletion(OpenAIServing): return_tokens_as_token_ids: bool = False, enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, + log_error_stack: bool = False, ): super().__init__( engine_client=engine_client, @@ -67,6 +68,7 @@ class OpenAIServingCompletion(OpenAIServing): request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, enable_force_include_usage=enable_force_include_usage, + log_error_stack=log_error_stack, ) self.enable_prompt_tokens_details = enable_prompt_tokens_details self.default_sampling_params = ( diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 9dcad8e391c68..45c1932f1873c 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -593,11 +593,13 @@ class OpenAIServingEmbedding(EmbeddingMixin): request_logger: Optional[RequestLogger], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, + log_error_stack: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, models=models, - request_logger=request_logger) + request_logger=request_logger, + log_error_stack=log_error_stack) self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 0f4a7c0186b65..a97935e109ef2 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -5,6 +5,7 @@ import io import json import sys import time +import traceback from collections.abc import AsyncGenerator, Iterable, Mapping, Sequence from concurrent.futures import ThreadPoolExecutor from http import HTTPStatus @@ -205,6 +206,7 @@ class OpenAIServing: request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, enable_force_include_usage: 
bool = False, + log_error_stack: bool = False, ): super().__init__() @@ -222,6 +224,7 @@ class OpenAIServing: self._async_tokenizer_pool: dict[AnyTokenizer, AsyncMicrobatchTokenizer] = {} + self.log_error_stack = log_error_stack def _get_async_tokenizer(self, tokenizer) -> AsyncMicrobatchTokenizer: """ @@ -412,6 +415,12 @@ class OpenAIServing: message: str, err_type: str = "BadRequestError", status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse: + if self.log_error_stack: + exc_type, _, _ = sys.exc_info() + if exc_type is not None: + traceback.print_exc() + else: + traceback.print_stack() return ErrorResponse(error=ErrorInfo( message=message, type=err_type, code=status_code.value)) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 38745d001ade6..e8cb1aed84596 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -58,11 +58,13 @@ class OpenAIServingPooling(OpenAIServing): request_logger: Optional[RequestLogger], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, + log_error_stack: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, models=models, - request_logger=request_logger) + request_logger=request_logger, + log_error_stack=log_error_stack) self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 67eec2d523e3f..899cb07b2b37d 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -88,6 +88,7 @@ class OpenAIServingResponses(OpenAIServing): enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, enable_log_outputs: bool = False, + log_error_stack: bool = False, ) -> None: super().__init__( 
engine_client=engine_client, @@ -96,6 +97,7 @@ class OpenAIServingResponses(OpenAIServing): request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, enable_force_include_usage=enable_force_include_usage, + log_error_stack=log_error_stack, ) self.chat_template = chat_template diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index c246274514dbf..37838e22a4002 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -47,11 +47,13 @@ class ServingScores(OpenAIServing): models: OpenAIServingModels, *, request_logger: Optional[RequestLogger], + log_error_stack: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, models=models, - request_logger=request_logger) + request_logger=request_logger, + log_error_stack=log_error_stack) async def _embedding_score( self, diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 58d720474768b..2f258255d5f16 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -39,11 +39,13 @@ class OpenAIServingTokenization(OpenAIServing): request_logger: Optional[RequestLogger], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, + log_error_stack: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, models=models, - request_logger=request_logger) + request_logger=request_logger, + log_error_stack=log_error_stack) self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 0d6989fe91bfa..9ba58d4425221 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ 
b/vllm/entrypoints/openai/serving_transcription.py @@ -32,13 +32,15 @@ class OpenAIServingTranscription(OpenAISpeechToText): *, request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, + log_error_stack: bool = False, ): super().__init__(engine_client=engine_client, model_config=model_config, models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, - task_type="transcribe") + task_type="transcribe", + log_error_stack=log_error_stack) async def create_transcription( self, audio_data: bytes, request: TranscriptionRequest, @@ -88,13 +90,15 @@ class OpenAIServingTranslation(OpenAISpeechToText): *, request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, + log_error_stack: bool = False, ): super().__init__(engine_client=engine_client, model_config=model_config, models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, - task_type="translate") + task_type="translate", + log_error_stack=log_error_stack) async def create_translation( self, audio_data: bytes, request: TranslationRequest, diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index de2619a78f8e0..1cbd7dba393f6 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -53,12 +53,14 @@ class OpenAISpeechToText(OpenAIServing): request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, task_type: Literal["transcribe", "translate"] = "transcribe", + log_error_stack: bool = False, ): super().__init__(engine_client=engine_client, model_config=model_config, models=models, request_logger=request_logger, - return_tokens_as_token_ids=return_tokens_as_token_ids) + return_tokens_as_token_ids=return_tokens_as_token_ids, + log_error_stack=log_error_stack) self.default_sampling_params = ( self.model_config.get_diff_sampling_param()) From 
142ac0803045b3a3edcd7aa58fe079872903a30c Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Tue, 26 Aug 2025 21:59:14 -0700 Subject: [PATCH 068/112] [Frontend] Optimize beam search performance by limiting concurrency (#23599) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- benchmarks/benchmark_throughput.py | 1 - tests/conftest.py | 8 +- tests/samplers/test_beam_search.py | 53 +++++++++++ vllm/entrypoints/llm.py | 138 ++++++++++++++++------------- 4 files changed, 136 insertions(+), 64 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c7f290e1eb88e..6b24b8c8f3c67 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -96,7 +96,6 @@ def run_vllm( end = time.perf_counter() else: assert lora_requests is None, "BeamSearch API does not support LoRA" - prompts = [request.prompt for request in requests] # output_len should be the same for all requests. output_len = requests[0].expected_output_len for request in requests: diff --git a/tests/conftest.py b/tests/conftest.py index 2bf88abb0f6c2..f8bfdfc8e6259 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1022,15 +1022,17 @@ class VllmRunner: images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, + concurrency_limit: Optional[int] = None, ) -> list[tuple[list[list[int]], list[str]]]: inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) - outputs = self.llm.beam_search( - inputs, - BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) + outputs = self.llm.beam_search(inputs, + BeamSearchParams(beam_width=beam_width, + max_tokens=max_tokens), + concurrency_limit=concurrency_limit) returned_outputs = [] for output in outputs: token_ids = [x.tokens for x in output.sequences] diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index bdf48c7687b25..cc9a88a255f9f 
100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -67,6 +67,59 @@ def test_beam_search_single_input( f"vLLM: {vllm_output_ids}") +@pytest.mark.skip_v1 # FIXME: This fails on V1 right now. +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", MAX_TOKENS) +@pytest.mark.parametrize("beam_width", BEAM_WIDTHS) +def test_beam_search_with_concurrency_limit( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, + beam_width: int, +) -> None: + # example_prompts[1]&[3]&[7] fail due to unknown reason even without + # concurrency limit. skip them for now. + example_prompts = (example_prompts[:8]) + concurrency_limit = 2 + assert len(example_prompts) > concurrency_limit + with vllm_runner(model, dtype=dtype) as vllm_model: + outputs_with_limit = vllm_model.generate_beam_search( + example_prompts, + beam_width, + max_tokens, + concurrency_limit=concurrency_limit) + outputs_without_limit = [] + + for i in range(0, len(example_prompts), concurrency_limit): + outputs_without_limit.extend( + vllm_model.generate_beam_search( + example_prompts[i:i + concurrency_limit], beam_width, + max_tokens)) + + correct = True + for i in range(len(example_prompts)): + output_ids_with_limit, output_texts_with_limit = outputs_with_limit[i] + output_ids_without_limit, output_texts_without_limit = ( + outputs_without_limit[i]) + for j, (text_with_limit, text_without_limit) in enumerate( + zip(output_texts_with_limit, output_texts_without_limit)): + print(f">>>{j}-th with limit output:") + print(text_with_limit) + print(f">>>{j}-th without limit output:") + print(text_without_limit) + assert len(output_ids_with_limit) == len(output_ids_without_limit) + for j in range(len(output_ids_with_limit)): + if output_ids_with_limit[j] != output_ids_without_limit[j]: + print(f"Test{i} output{j}:\n+limit: {output_ids_with_limit}\n" + f"-limit: 
{output_ids_without_limit}") + correct = False + assert correct + + @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", MAX_TOKENS) @pytest.mark.parametrize("beam_width", MM_BEAM_WIDTHS) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 8816ff56d6840..72b6123670b70 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -523,6 +523,7 @@ class LLM: params: BeamSearchParams, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, use_tqdm: bool = False, + concurrency_limit: Optional[int] = None, ) -> list[BeamSearchOutput]: """ Generate sequences using beam search. @@ -533,6 +534,8 @@ class LLM: params: The beam search parameters. lora_request: LoRA request to use for generation, if any. use_tqdm: Whether to use tqdm to display the progress bar. + concurrency_limit: The maximum number of concurrent requests. + If None, the number of concurrent requests is unlimited. """ # TODO: how does beam search work together with length penalty, # frequency, penalty, and stopping criteria, etc.? @@ -551,6 +554,15 @@ class LLM: length_penalty, ) + if use_tqdm and concurrency_limit is not None: + logger.warning( + "Progress bar is not supported when using concurrency_limit. " + "Disabling progress bar.") + use_tqdm = False + + if concurrency_limit is None: + concurrency_limit = len(prompts) + def create_tokens_prompt_from_beam( beam: BeamSearchSequence) -> TokensPrompt: token_prompt_kwargs: TokensPrompt = { @@ -595,73 +607,79 @@ class LLM: **mm_kwargs, ), ) - token_iter = range(max_tokens) - if use_tqdm: - token_iter = tqdm(token_iter, - desc="Beam search", - unit="token", - unit_scale=False) - logger.warning( - "The progress bar shows the upper bound on token steps and " - "may finish early due to stopping conditions. 
It does not " - "reflect instance-level progress.") + for prompt_start in range(0, len(prompts), concurrency_limit): + instances_batch = instances[prompt_start:prompt_start + + concurrency_limit] - for _ in token_iter: - all_beams: list[BeamSearchSequence] = list( - sum((instance.beams for instance in instances), [])) - pos = [0] + list( - itertools.accumulate( - len(instance.beams) for instance in instances)) - instance_start_and_end: list[tuple[int, int]] = list( - zip(pos[:-1], pos[1:])) + token_iter = range(max_tokens) + if use_tqdm: + token_iter = tqdm(token_iter, + desc="Beam search", + unit="token", + unit_scale=False) + logger.warning( + "The progress bar shows the upper bound on token steps and " + "may finish early due to stopping conditions. It does not " + "reflect instance-level progress.") + for _ in token_iter: + all_beams: list[BeamSearchSequence] = list( + sum((instance.beams for instance in instances_batch), [])) + pos = [0] + list( + itertools.accumulate( + len(instance.beams) for instance in instances_batch)) + instance_start_and_end: list[tuple[int, int]] = list( + zip(pos[:-1], pos[1:])) - if len(all_beams) == 0: - break + if len(all_beams) == 0: + break - # create the corresponding batch entries for prompt & optional lora - prompts_batch, lora_req_batch = zip( - *[(create_tokens_prompt_from_beam(beam), beam.lora_request) - for beam in all_beams]) + # create corresponding batch entries for prompt & optional lora + prompts_batch, lora_req_batch = zip( + *[(create_tokens_prompt_from_beam(beam), beam.lora_request) + for beam in all_beams]) - # only runs for one step - # we don't need to use tqdm here - output = self.generate(prompts_batch, - sampling_params=beam_search_params, - use_tqdm=False, - lora_request=lora_req_batch) + # only runs for one step + # we don't need to use tqdm here + output = self.generate(prompts_batch, + sampling_params=beam_search_params, + use_tqdm=False, + lora_request=lora_req_batch) - for (start, end), instance in 
zip(instance_start_and_end, - instances): - instance_new_beams = [] - for i in range(start, end): - current_beam = all_beams[i] - result = output[i] + for (start, end), instance in zip(instance_start_and_end, + instances_batch): + instance_new_beams = [] + for i in range(start, end): + current_beam = all_beams[i] + result = output[i] - if result.outputs[0].logprobs is not None: - # if `result.outputs[0].logprobs` is None, it means - # the sequence is completed because of the max-model-len - # or abortion. we don't need to add it to the new beams. - logprobs = result.outputs[0].logprobs[0] - for token_id, logprob_obj in logprobs.items(): - new_beam = BeamSearchSequence( - tokens=current_beam.tokens + [token_id], - logprobs=current_beam.logprobs + [logprobs], - lora_request=current_beam.lora_request, - cum_logprob=current_beam.cum_logprob + - logprob_obj.logprob, - multi_modal_data=current_beam.multi_modal_data, - mm_processor_kwargs=current_beam. - mm_processor_kwargs) + if result.outputs[0].logprobs is not None: + # if `result.outputs[0].logprobs` is None, it means + # the sequence is completed because of the + # max-model-len or abortion. we don't need to add + # it to the new beams. + logprobs = result.outputs[0].logprobs[0] + for token_id, logprob_obj in logprobs.items(): + new_beam = BeamSearchSequence( + tokens=current_beam.tokens + [token_id], + logprobs=current_beam.logprobs + + [logprobs], + lora_request=current_beam.lora_request, + cum_logprob=current_beam.cum_logprob + + logprob_obj.logprob, + multi_modal_data=current_beam. + multi_modal_data, + mm_processor_kwargs=current_beam. 
+ mm_processor_kwargs) - if token_id == tokenizer.eos_token_id and \ - not ignore_eos: - instance.completed.append(new_beam) - else: - instance_new_beams.append(new_beam) - sorted_beams = sorted(instance_new_beams, - key=sort_beams_key, - reverse=True) - instance.beams = sorted_beams[:beam_width] + if token_id == tokenizer.eos_token_id and \ + not ignore_eos: + instance.completed.append(new_beam) + else: + instance_new_beams.append(new_beam) + sorted_beams = sorted(instance_new_beams, + key=sort_beams_key, + reverse=True) + instance.beams = sorted_beams[:beam_width] outputs = [] for instance in instances: From d272415e57c95da63c798c22c7d87cc5c0cda21f Mon Sep 17 00:00:00 2001 From: Dipika Sikka <dipikasikka1@gmail.com> Date: Wed, 27 Aug 2025 01:00:21 -0400 Subject: [PATCH 069/112] [Quantization] Expand compressed-tensors MoE matching logic to support NFP4 + FP8 MoEs (#22674) Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com> Signed-off-by: Dipika <dipikasikka1@gmail.com> --- .../compressed_tensors/compressed_tensors.py | 13 +++---- .../compressed_tensors_moe.py | 36 +++++++++++++++++-- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index ce74375aab426..245cf122ebab1 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -425,6 +425,10 @@ class CompressedTensorsConfig(QuantizationConfig): weight_quant: BaseModel, input_quant: BaseModel, format: Optional[str] = None) -> "CompressedTensorsScheme": + + # use the per-layer format if defined, otherwise, use global format + format = format if format is not None else self.quant_format + # Detect If Mixed Precision if self._is_fp4a16_nvfp4(weight_quant, input_quant): return CompressedTensorsW4A16Fp4() @@ -437,14 +441,14 
@@ class CompressedTensorsConfig(QuantizationConfig): actorder=weight_quant.actorder) if self._is_wNa16_group_channel(weight_quant, input_quant): - if (self.quant_format == CompressionFormat.marlin_24.value + if (format == CompressionFormat.marlin_24.value and weight_quant.num_bits in W4A16SPARSE24_SUPPORTED_BITS): assert weight_quant.symmetric return CompressedTensorsW4A16Sparse24( strategy=weight_quant.strategy, num_bits=weight_quant.num_bits, group_size=weight_quant.group_size) - if (self.quant_format == CompressionFormat.pack_quantized.value + if (format == CompressionFormat.pack_quantized.value and weight_quant.num_bits in WNA16_SUPPORTED_BITS): return CompressedTensorsWNA16( num_bits=weight_quant.num_bits, @@ -453,10 +457,7 @@ class CompressedTensorsConfig(QuantizationConfig): group_size=weight_quant.group_size, actorder=weight_quant.actorder) - act_quant_format = is_activation_quantization_format( - format - ) if format is not None else is_activation_quantization_format( - self.quant_format) + act_quant_format = is_activation_quantization_format(format) if act_quant_format: if self._is_fp4a4_nvfp4(weight_quant, input_quant): if cutlass_fp4_supported( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 1ee3478aa4f43..6279bb8b60570 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -22,6 +22,8 @@ from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( is_valid_flashinfer_cutlass_fused_moe) from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import ( # noqa WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + find_matched_target) from 
vllm.model_executor.layers.quantization.utils import replace_parameter from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( build_flashinfer_fp4_cutlass_moe_prepare_finalize, reorder_w1w3_to_w3w1, @@ -65,12 +67,40 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): @staticmethod def get_moe_method( quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 - layer: torch.nn.Module, + layer: torch.nn.Module ) -> "CompressedTensorsMoEMethod": # TODO: @dsikka: refactor this to use schemes as other kernels # are supported + check if the layer is being ignored. - weight_quant = quant_config.target_scheme_map["Linear"].get("weights") - input_quant = quant_config.target_scheme_map["Linear"].get( + # Check if using "Linear" to select schemes + if "Linear" in quant_config.target_scheme_map: + matched_target = "Linear" + else: + # May have instead defined the linear layers in the fused model + + fused_layers = [ + "re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*" + ] + current_scheme = None + for fused_layer in fused_layers: + # Check if one of the fused layers is defined in quant_config + matched_target = find_matched_target( + layer_name=fused_layer, + module=layer, + targets=quant_config.target_scheme_map.keys(), + fused_mapping=quant_config.packed_modules_mapping) + + # Only valid if down_proj, gate_proj, and up_proj + # are mapped to the same quant scheme in the quant_config + if current_scheme is None: + current_scheme = quant_config.target_scheme_map.get( + matched_target) + else: + assert current_scheme == quant_config.target_scheme_map.get( + matched_target) + + weight_quant = quant_config.target_scheme_map[matched_target].get( + "weights") + input_quant = quant_config.target_scheme_map[matched_target].get( "input_activations") if quant_config._is_wNa16_group_channel(weight_quant, input_quant): From fce10dbed5441b4f918b23a2b63aae72bc00a2f6 Mon Sep 17 00:00:00 2001 From: Kunshang Ji <kunshang.ji@intel.com> Date: Wed,
27 Aug 2025 13:33:27 +0800 Subject: [PATCH 070/112] [XPU] Add xpu torch.compile support (#22609) Signed-off-by: Kunshang Ji <kunshang.ji@intel.com> --- .buildkite/scripts/hardware_ci/run-xpu-test.sh | 1 + vllm/attention/layer.py | 3 +-- vllm/compilation/fix_functionalization.py | 8 ++++++++ vllm/platforms/cpu.py | 4 ++++ vllm/platforms/cuda.py | 4 ++++ vllm/platforms/interface.py | 8 ++++++++ vllm/platforms/rocm.py | 4 ++++ vllm/platforms/xpu.py | 15 ++++++--------- 8 files changed, 36 insertions(+), 11 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 445cd2735c190..73f3e63fbf5f6 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -31,6 +31,7 @@ docker run \ set -e echo $ZE_AFFINITY_MASK VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager + VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp cd tests diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 2d288bcbe0c95..237802afccde9 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -190,8 +190,7 @@ class Attention(nn.Module, AttentionLayerBase): # torch.compile works by registering the attention as one giant # opaque custom op. For other platforms, we directly call them # and let torch.compile handle them. 
- self.use_direct_call = not current_platform.is_cuda_alike( - ) and not current_platform.is_cpu() + self.use_direct_call = not current_platform.opaque_attention_op() self.use_output = self.attn_backend.accept_output_buffer compilation_config = get_current_vllm_config().compilation_config diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index 286221d32c1ee..60ae143318790 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -9,6 +9,7 @@ import torch from torch._higher_order_ops.auto_functionalize import auto_functionalized from vllm.logger import init_logger +from vllm.platforms import current_platform from .fx_utils import is_func from .vllm_inductor_pass import VllmInductorPass @@ -26,6 +27,13 @@ class FixFunctionalizationPass(VllmInductorPass): """ def __call__(self, graph: torch.fx.Graph): + # XPU does not support auto-functionalization yet. + # Will enable this when switch to vllm-xpu-kernels. 
+ if current_platform.is_xpu(): + logger.debug("XPU platform does not support fix functionalization " + "pass currently.") + return + self.begin() self.dump_graph(graph, "before_fix_functionalization") diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index c748595a71534..5686fae5cd7d1 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -335,3 +335,7 @@ class CpuPlatform(Platform): return (cls.supports_v1(model_config) and arch in (CpuArchEnum.X86, CpuArchEnum.POWERPC, CpuArchEnum.ARM, CpuArchEnum.S390X)) + + @classmethod + def opaque_attention_op(cls) -> bool: + return True diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index c0e0fe35e4024..5cbb7346436ef 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -442,6 +442,10 @@ class CudaPlatformBase(Platform): def use_custom_allreduce(cls) -> bool: return True + @classmethod + def opaque_attention_op(cls) -> bool: + return True + @classmethod def get_static_graph_wrapper_cls(cls) -> str: return "vllm.compilation.cuda_graph.CUDAGraphWrapper" diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index f6c17de86d05a..01f3e2d977bc3 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -509,6 +509,14 @@ class Platform: """ return False + + @classmethod + def opaque_attention_op(cls) -> bool: + """ + Returns True if we register attention as one giant opaque custom op + on the current platform + """ + return False + @classmethod def validate_request( cls, diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 85b2fe2e480c8..c6d14aa87c7f2 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -411,6 +411,10 @@ class RocmPlatform(Platform): supported_archs = ['gfx94', 'gfx95'] return any(gfx in gcn_arch for gfx in supported_archs) + @classmethod + def opaque_attention_op(cls) -> bool: + return True + @classmethod def get_cu_count(cls, device_id: int = 0) -> int: return torch.cuda.get_device_properties( diff --git
a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 235e5d8294e52..84f4cd7256465 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -90,21 +90,14 @@ class XPUPlatform(Platform): if cache_config and cache_config.block_size is None: cache_config.block_size = 64 - # FIXME: Temporarily forcing eager mode - # remove after t.compile support stabilizes. - if (envs.VLLM_USE_V1 and model_config is not None - and not vllm_config.model_config.enforce_eager): - from vllm.config import CompilationLevel - vllm_config.compilation_config.level = CompilationLevel.NO_COMPILATION # noqa: E501 - # lazy import to avoid circular import from vllm.config import CUDAGraphMode compilation_config = vllm_config.compilation_config if compilation_config.cudagraph_mode is None or \ compilation_config.cudagraph_mode.max_cudagraph_mode() \ != CUDAGraphMode.NONE: - logger.info("[XPU] CUDA graph is not supported on XPU, " - "disabling cudagraphs.") + logger.info("[XPU] CUDA graph is not supported on XPU, disabling " + "cudagraphs. Fallback to cudagraph_mode=NONE") compilation_config.cudagraph_mode = CUDAGraphMode.NONE # check and update parallel config @@ -182,3 +175,7 @@ class XPUPlatform(Platform): "Intel Arc A770 have bfloat16 accuracy known issue. 
" "You can use float16 instead by explicitly setting the " "`dtype` flag in CLI, for example: --dtype=half.") + + @classmethod + def opaque_attention_op(cls) -> bool: + return True From 9de25c294b92e42a12d1fbbb3ab3f633fa80291c Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Wed, 27 Aug 2025 13:51:50 +0800 Subject: [PATCH 071/112] [CI/Build] Remove redundant LoRA model tests (#23706) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- tests/lora/conftest.py | 5 -- tests/lora/test_baichuan.py | 112 ------------------------------------ tests/lora/test_phi.py | 71 ----------------------- 3 files changed, 188 deletions(-) delete mode 100644 tests/lora/test_baichuan.py delete mode 100644 tests/lora/test_phi.py diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index cba573b63c045..3475993ff8f07 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -216,11 +216,6 @@ def tinyllama_lora_files(): return snapshot_download(repo_id="jashing/tinyllama-colorist-lora") -@pytest.fixture(scope="session") -def phi2_lora_files(): - return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora") - - @pytest.fixture def reset_default_device(): """ diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py deleted file mode 100644 index 774ebb9db2106..0000000000000 --- a/tests/lora/test_baichuan.py +++ /dev/null @@ -1,112 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -import vllm -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.lora.request import LoRARequest - -MODEL_PATH = "baichuan-inc/Baichuan-7B" - -PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, 
concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: - prompts = [ - PROMPT_TEMPLATE.format(query="How many singers do we have?"), - PROMPT_TEMPLATE.format( - query= - "What is the average, minimum, and maximum age of all singers from France?" # noqa: E501 - ), - PROMPT_TEMPLATE.format( - query= - "Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501 - ), - ] - print(prompts) - sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) - # Print the outputs. 
- generated_texts: list[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text.strip() - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -def test_baichuan_lora(baichuan_lora_files): - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=4, - max_lora_rank=64, - trust_remote_code=True) - - expected_lora_output = [ - "SELECT count(*) FROM singer", - "SELECT avg(age) , min(age) , max(age) FROM singer WHERE Country = 'France'", # noqa: E501 - "SELECT name , country , age FROM singer ORDER BY age ASC", - ] - - output1 = do_sample(llm, baichuan_lora_files, lora_id=1) - for i in range(len(expected_lora_output)): - assert output1[i] == expected_lora_output[i] - output2 = do_sample(llm, baichuan_lora_files, lora_id=2) - for i in range(len(expected_lora_output)): - assert output2[i] == expected_lora_output[i] - - -@pytest.mark.parametrize("fully_sharded", [True, False]) -def test_baichuan_tensor_parallel_equality(baichuan_lora_files, - num_gpus_available, fully_sharded): - if num_gpus_available < 4: - pytest.skip(f"Not enough GPUs for tensor parallelism {4}") - - llm_tp1 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - max_lora_rank=64, - trust_remote_code=True, - fully_sharded_loras=fully_sharded) - output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1) - - del llm_tp1 - cleanup_dist_env_and_memory() - - llm_tp2 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - max_lora_rank=64, - tensor_parallel_size=2, - trust_remote_code=True, - fully_sharded_loras=fully_sharded) - output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2) - - del llm_tp2 - cleanup_dist_env_and_memory() - - assert output_tp1 == output_tp2 - - llm_tp4 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - max_lora_rank=64, - tensor_parallel_size=4, - 
trust_remote_code=True, - fully_sharded_loras=fully_sharded) - output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2) - - del llm_tp4 - cleanup_dist_env_and_memory() - - assert output_tp1 == output_tp4 diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py deleted file mode 100644 index 3090941e63679..0000000000000 --- a/tests/lora/test_phi.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import vllm -from vllm.lora.request import LoRARequest - -MODEL_PATH = "microsoft/phi-2" - -PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: - prompts = [ - PROMPT_TEMPLATE.format( - sql_prompt= - "Which catalog publisher has published the most catalogs?", - context="CREATE TABLE catalogs (catalog_publisher VARCHAR);"), - PROMPT_TEMPLATE.format( - sql_prompt= - "Which trip started from the station with the largest dock count? Give me the trip id.", # noqa: E501 - context= - "CREATE TABLE trip (id VARCHAR, start_station_id VARCHAR); CREATE TABLE station (id VARCHAR, dock_count VARCHAR);" # noqa: E501 - ), - PROMPT_TEMPLATE.format( - sql_prompt= - "How many marine species are found in the Southern Ocean?", # noqa: E501 - context= - "CREATE TABLE marine_species (name VARCHAR(50), common_name VARCHAR(50), location VARCHAR(50));" # noqa: E501 - ), - ] - sampling_params = vllm.SamplingParams(temperature=0, - max_tokens=64, - stop="### End") - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None, - ) - # Print the outputs. 
- generated_texts: list[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text.strip() - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -def test_phi2_lora(phi2_lora_files): - # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI, - # Otherwise, the lora-test will fail due to CUDA OOM. - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=2, - enforce_eager=True, - enable_chunked_prefill=True) - - expected_lora_output = [ - "SELECT catalog_publisher, COUNT(*) as num_catalogs FROM catalogs GROUP BY catalog_publisher ORDER BY num_catalogs DESC LIMIT 1;", # noqa: E501 - "SELECT trip.id FROM trip JOIN station ON trip.start_station_id = station.id WHERE station.dock_count = (SELECT MAX(dock_count) FROM station);", # noqa: E501 - "SELECT COUNT(*) FROM marine_species WHERE location = 'Southern Ocean';", # noqa: E501 - ] - - output1 = do_sample(llm, phi2_lora_files, lora_id=1) - for i in range(len(expected_lora_output)): - assert output1[i].startswith(expected_lora_output[i]) - output2 = do_sample(llm, phi2_lora_files, lora_id=2) - for i in range(len(expected_lora_output)): - assert output2[i].startswith(expected_lora_output[i]) From 8dbf6ed7be3f8602257ce1879825d4b5e3554d67 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" <rongfu.leng@daocloud.io> Date: Wed, 27 Aug 2025 13:54:39 +0800 Subject: [PATCH 072/112] [Bugfix] fix when config.yaml config value is list parse error (#23528) Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io> --- tests/utils_/test_utils.py | 41 ++++++++++++++++++++++++++++++++++++++ vllm/utils/__init__.py | 9 +++++++-- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 084d82dee11b3..04195ea0cf92e 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -5,13 +5,17 @@ import asyncio 
import hashlib import json +import os import pickle import socket +import tempfile from collections.abc import AsyncIterator +from pathlib import Path from unittest.mock import patch import pytest import torch +import yaml import zmq from transformers import AutoTokenizer from vllm_test_utils.monitor import monitor @@ -991,3 +995,40 @@ def test_current_stream_multithread(): child_thread.join(timeout=5) if child_thread.is_alive(): pytest.fail("Child thread failed to exit properly") + + +def test_load_config_file(tmp_path): + # Define the configuration data + config_data = { + "enable-logging": True, + "list-arg": ["item1", "item2"], + "port": 12323, + "tensor-parallel-size": 4 + } + + # Write the configuration data to a temporary YAML file + config_file_path = tmp_path / "config.yaml" + with open(config_file_path, "w") as config_file: + yaml.dump(config_data, config_file) + + # Initialize the parser + parser = FlexibleArgumentParser() + + # Call the function with the temporary file path + processed_args = parser.load_config_file(str(config_file_path)) + + # Expected output + expected_args = [ + "--enable-logging", + "--list-arg", + "item1", + "item2", + "--port", + "12323", + "--tensor-parallel-size", + "4", + ] + + # Assert that the processed arguments match the expected output + assert processed_args == expected_args + os.remove(str(config_file_path)) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 7c34a858c0a21..60bddc5b500b5 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1974,7 +1974,7 @@ class FlexibleArgumentParser(ArgumentParser): file_path = args[index + 1] - config_args = self._load_config_file(file_path) + config_args = self.load_config_file(file_path) # 0th index is for {serve,chat,complete} # optionally followed by model_tag (only for serve) @@ -2005,7 +2005,7 @@ class FlexibleArgumentParser(ArgumentParser): return args - def _load_config_file(self, file_path: str) -> list[str]: + def load_config_file(self, 
file_path: str) -> list[str]: """Loads a yaml file and returns the key value pairs as a flattened list with argparse like pattern ```yaml @@ -2046,6 +2046,11 @@ class FlexibleArgumentParser(ArgumentParser): if isinstance(value, bool) and key not in store_boolean_arguments: if value: processed_args.append('--' + key) + elif isinstance(value, list): + if value: + processed_args.append('--' + key) + for item in value: + processed_args.append(str(item)) else: processed_args.append('--' + key) processed_args.append(str(value)) From 69244e67e6822f1c15816f887659e1ccc18c2632 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Wed, 27 Aug 2025 14:19:13 +0800 Subject: [PATCH 073/112] [Core] Use key-only cache for `BaseMultiModalProcessor` (#23018) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- docs/configuration/conserving_memory.md | 2 +- docs/configuration/optimization.md | 44 +- .../multimodal/processing/test_common.py | 8 +- tests/multimodal/test_cache.py | 182 +++++++- vllm/config/__init__.py | 26 +- vllm/engine/arg_utils.py | 14 +- vllm/engine/llm_engine.py | 15 +- vllm/inputs/preprocess.py | 22 +- vllm/inputs/registry.py | 12 +- .../models/hyperclovax_vision.py | 7 +- vllm/model_executor/models/llava.py | 8 +- vllm/model_executor/models/minicpmv.py | 40 +- vllm/model_executor/models/mistral3.py | 8 +- vllm/model_executor/models/phi3v.py | 20 +- vllm/model_executor/models/phi4mm.py | 21 +- vllm/model_executor/models/tarsier.py | 7 +- vllm/multimodal/cache.py | 405 +++++++++++++++++- vllm/multimodal/inputs.py | 38 +- vllm/multimodal/processing.py | 187 ++++---- vllm/multimodal/profiling.py | 4 +- vllm/multimodal/registry.py | 90 ++-- vllm/v1/engine/async_llm.py | 3 +- vllm/v1/engine/core.py | 17 +- vllm/v1/engine/llm_engine.py | 3 +- vllm/v1/engine/mm_input_cache.py | 121 ------ vllm/v1/engine/processor.py | 29 +- vllm/v1/worker/gpu_model_runner.py | 3 + vllm/v1/worker/tpu_model_runner.py | 3 + vllm/v1/worker/utils.py | 9 +- 29 files 
changed, 954 insertions(+), 394 deletions(-) delete mode 100644 vllm/v1/engine/mm_input_cache.py diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 058eba5fe0b1e..efda9c8e019eb 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -86,7 +86,7 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", If you run out of CPU RAM, try the following options: -- (Multi-modal models only) you can set the size of multi-modal processor cache by setting `mm_processor_cache_gb` engine argument (default 4 GiB per API process + 4 GiB per engine core process) +- (Multi-modal models only) you can set the size of multi-modal cache by setting `mm_processor_cache_gb` engine argument (default 4 GiB). - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). ## Multi-modal input limits diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index bb47e1b90f086..3eaf2185a559e 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -204,20 +204,33 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2 to avoid CPU resource exhaustion. !!! note - [Multi-modal processor cache](#processor-cache) is disabled when API server scale-out is enabled - because it requires a one-to-one correspondence between API and engine core processes. + API server scale-out disables [multi-modal IPC caching](#ipc-caching) + because it requires a one-to-one correspondence between API and engine core processes. + + This does not impact [multi-modal processor caching](#processor-caching).
## Multi-Modal Caching -### Processor Cache - -By default, the multi-modal processor cache is enabled to avoid repeatedly processing -the same multi-modal inputs via Hugging Face `AutoProcessor`, +Multi-modal caching avoids repeated transfer or processing of the same multi-modal data, which commonly occurs in multi-turn conversations. -You can adjust the size of the cache by setting the value of `mm_processor_cache_gb` -(default 4 GiB per API process + 4 GiB per engine core process). -If you do not benefit much from the cache, you can disable it completely via `mm_processor_cache_gb=0`. +### Processor Caching + +Multi-modal processor caching is automatically enabled +to avoid repeatedly processing the same multi-modal inputs in `BaseMultiModalProcessor`. + +### IPC Caching + +Multi-modal IPC caching is automatically enabled when +there is a one-to-one correspondence between API (`P0`) and engine core (`P1`) processes, +to avoid repeatedly transferring the same multi-modal inputs between them. + +### Configuration + +You can adjust the size of the cache by setting the value of `mm_processor_cache_gb` (default 4 GiB). + +If you do not benefit much from the cache, you can disable both IPC +and processor caching completely via `mm_processor_cache_gb=0`. Examples: @@ -230,3 +243,16 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", mm_processor_cache_gb=0) ``` + +### Cache Placement + +Based on the configuration, the content of the multi-modal caches on `P0` and `P1` are as follows: + +| Processor Caching | IPC Caching | `P0` Cache | `P1` Cache | Max.
Memory | +|-------------------|-------------|------------|------------|-------------| +| ✅ | ✅ | K | K + V | `mm_processor_cache_gb * data_parallel_size` | +| ✅ | ❌ | K + V | N/A | `mm_processor_cache_gb * api_server_count` | +| ❌ | ❌ | N/A | N/A | `0` | + +K: Stores the hashes of multi-modal items +V: Stores the processed tensor data of multi-modal items diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 6361cb9b5586a..3ff4360b83345 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -14,8 +14,9 @@ from PIL import Image from vllm.config import ModelConfig from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict +from vllm.multimodal.cache import MultiModalProcessorOnlyCache from vllm.multimodal.inputs import MultiModalInputs -from vllm.multimodal.processing import BaseMultiModalProcessor, ProcessingCache +from vllm.multimodal.processing import BaseMultiModalProcessor from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, cached_tokenizer_from_config, encode_tokens) @@ -63,6 +64,8 @@ def _test_processing_correctness( revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=model_info.hf_overrides, + # Ensure that the cache can fit all of the data + mm_processor_cache_gb=2048, ) model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) @@ -71,8 +74,7 @@ def _test_processing_correctness( model_config, tokenizer=cached_tokenizer_from_config(model_config), ) - # Ensure that it can fit all of the data - cache = ProcessingCache(capacity_gb=2048) + cache = MultiModalProcessorOnlyCache(model_config) processing_info = factories.info(ctx) supported_mm_limits = processing_info.get_supported_mm_limits() diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index 088cd00db2e04..44c05db2278f7 100644 
--- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -1,32 +1,64 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import numpy as np import pytest import torch -from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata +from vllm.config import ModelConfig, ParallelConfig, VllmConfig +from vllm.multimodal.cache import (MultiModalCache, + MultiModalProcessorCacheItem, + MultiModalProcessorCacheItemMetadata, + processor_cache_from_config, + receiver_cache_from_config) +from vllm.multimodal.hasher import MultiModalHasher from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem, MultiModalKwargsItems, MultiModalSharedField) +from vllm.multimodal.processing import PromptInsertion +from vllm.multimodal.registry import MultiModalRegistry -def _dummy_elem(modality: str, key: str, size: int): +def _dummy_elem( + modality: str, + key: str, + size: int, + *, + rng: Optional[np.random.RandomState] = None, +): + if rng is None: + data = torch.empty((size, ), dtype=torch.int8) + else: + data = torch.from_numpy(rng.randint(4, size=(size, ), dtype=np.int8)) + return MultiModalFieldElem( modality=modality, key=key, - data=torch.empty((size, ), dtype=torch.int8), + data=data, field=MultiModalSharedField(1), ) -def _dummy_item(modality: str, size_by_key: dict[str, int]): +def _dummy_item( + modality: str, + size_by_key: dict[str, int], + *, + rng: Optional[np.random.RandomState] = None, +): return MultiModalKwargsItem.from_elems([ - _dummy_elem(modality, key, size) for key, size in size_by_key.items() + _dummy_elem(modality, key, size, rng=rng) + for key, size in size_by_key.items() ]) -def _dummy_items(size_by_key_modality: dict[str, dict[str, int]]): +def _dummy_items( + size_by_key_modality: dict[str, dict[str, int]], + *, + rng: Optional[np.random.RandomState] = None, +): return MultiModalKwargsItems.from_seq([ - 
_dummy_item(modality, size_by_key) + _dummy_item(modality, size_by_key, rng=rng) for modality, size_by_key in size_by_key_modality.items() ]) @@ -48,5 +80,139 @@ def test_cache_item_size(item, expected_size): cache[""] = item assert cache.currsize == expected_size - cache[""] = MultiModalCacheItemMetadata.wraps(item) + prompt_update = PromptInsertion("dummy", "target", "insertion") \ + .resolve(0) + + cache[""] = MultiModalProcessorCacheItem(item, [prompt_update]) assert cache.currsize == expected_size + + cache[""] = MultiModalProcessorCacheItemMetadata(item, [prompt_update]) + assert cache.currsize == expected_size + + +def _create_vllm_config( + *, + mm_processor_cache_gb: float, + enable_ipc: bool, +): + return VllmConfig( + model_config=ModelConfig(mm_processor_cache_gb=mm_processor_cache_gb), + parallel_config=ParallelConfig( + data_parallel_size=1 if enable_ipc else 2), + ) + + +def _compare_caches( + config_0: VllmConfig, + config_1: VllmConfig, + *, + item_capacity: int = 8, + hit_rate: float = 0.5, + max_items_per_iter: int = 3, + is_cached_calls_per_iter: int, + n_iter: int = 100, + seed: int = 0, +): + mm_registry = MultiModalRegistry() + cache_0_p0 = processor_cache_from_config(config_0, mm_registry) + cache_0_p1 = receiver_cache_from_config(config_0, mm_registry) + cache_1_p0 = processor_cache_from_config(config_1, mm_registry) + cache_1_p1 = receiver_cache_from_config(config_1, mm_registry) + + cache_size_gb = max( + config_0.model_config.mm_processor_cache_gb, + config_1.model_config.mm_processor_cache_gb, + ) + item_size_gb = int(cache_size_gb / item_capacity) + + rng = np.random.RandomState(seed) + all_items = [ + _dummy_item("item", {"key": item_size_gb}, rng=rng) + for _ in range(int(item_capacity / hit_rate)) + ] + all_hashes = [ + MultiModalHasher.hash_kwargs(item=item.get_data()) + for item in all_items + ] + + # Should not be used since there is nothing to convert to text + prompt_update = PromptInsertion("dummy", "target", "insertion") + + 
for it in range(n_iter): + num_items_to_select = rng.randint(0, max_items_per_iter) + item_idxs_to_select = rng.choice(len(all_items), num_items_to_select) + + selected_items = [all_items[idx] for idx in item_idxs_to_select] + selected_hashes = [all_hashes[idx] for idx in item_idxs_to_select] + + if cache_0_p0 is None: + cache_0_p0_out = selected_items + else: + for _ in range(is_cached_calls_per_iter): + cache_0_p0.is_cached(selected_hashes) + cache_0_p0_out = [ + item for item, _ in cache_0_p0.get_and_update( + [(item, prompt_update.content) for item in selected_items], + selected_hashes, + ) + ] + + if cache_1_p0 is None: + cache_1_p0_out = selected_items + else: + for _ in range(is_cached_calls_per_iter): + cache_1_p0.is_cached(selected_hashes) + cache_1_p0_out = [ + item for item, _ in cache_1_p0.get_and_update( + [(item, prompt_update.content) for item in selected_items], + selected_hashes, + ) + ] + + if cache_0_p1 is None: + cache_0_p1_out = cache_0_p0_out + else: + cache_0_p1_out = cache_0_p1.get_and_update(cache_0_p0_out, + selected_hashes) + + if cache_1_p1 is None: + cache_1_p1_out = cache_1_p0_out + else: + cache_1_p1_out = cache_1_p1.get_and_update(cache_1_p0_out, + selected_hashes) + + assert cache_0_p1_out == cache_1_p1_out, f"Failed at {it=}" + + +@pytest.mark.parametrize("is_cached_calls_per_iter", [1, 2, 3]) +def test_ipc_enable_disable_consistency(is_cached_calls_per_iter): + cache_size_gb = 1 / (1 << 20) + + vllm_config_ipc_enabled = _create_vllm_config( + mm_processor_cache_gb=cache_size_gb, + enable_ipc=True, + ) + vllm_config_ipc_disabled = _create_vllm_config( + mm_processor_cache_gb=0, + enable_ipc=False, + ) + vllm_config_cache_disabled = _create_vllm_config( + mm_processor_cache_gb=cache_size_gb, + enable_ipc=True, + ) + + _compare_caches( + vllm_config_ipc_enabled, + vllm_config_ipc_disabled, + is_cached_calls_per_iter=is_cached_calls_per_iter, + ) + _compare_caches( + vllm_config_ipc_disabled, + vllm_config_cache_disabled, + 
is_cached_calls_per_iter=is_cached_calls_per_iter, + ) + _compare_caches( + vllm_config_cache_disabled, + vllm_config_ipc_enabled, + is_cached_calls_per_iter=is_cached_calls_per_iter, + ) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index cd0e17977edec..ac6f51df95498 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -437,7 +437,7 @@ class ModelConfig: from `AutoProcessor.from_pretrained`. The available overrides depend on the model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`. """ - mm_processor_cache_gb: int = 4 + mm_processor_cache_gb: float = 4 """The size (in GiB) of the multi-modal processor cache, which is used to avoid re-processing past multi-modal inputs. @@ -884,12 +884,6 @@ class ModelConfig: return None - def set_mm_processor_cache_gb(self, value: int) -> None: - mm_config = self.get_multimodal_config() - - self.mm_processor_cache_gb = value - mm_config.mm_processor_cache_gb = value - def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( self.model, self.revision) @@ -1697,22 +1691,6 @@ class ModelConfig: def is_multimodal_model(self) -> bool: return self.multimodal_config is not None - @property - def enable_mm_processor_cache(self) -> bool: - """Whether the multi-modal processor cache should be enabled.""" - mm_config = self.multimodal_config - if mm_config is None: - return False - - return mm_config.mm_processor_cache_gb > 0 - - def get_mm_input_cache_gb(self) -> int: - mm_config = self.multimodal_config - if mm_config is None: - return 0 - - return envs.VLLM_MM_INPUT_CACHE_GIB - @property def is_cross_encoder(self) -> bool: return (self._model_info.supports_cross_encoding @@ -2561,7 +2539,7 @@ class MultiModalConfig: `{"num_crops": 4}`. 
""" - mm_processor_cache_gb: int = 4 + mm_processor_cache_gb: float = 4 """ The size (in GiB) of the multi-modal processor cache, which is used to diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f24c50ad73261..9e7c95ea5205f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -351,7 +351,7 @@ class EngineArgs: mm_processor_kwargs: Optional[Dict[str, Any]] = \ MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED - mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb + mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling # LoRA fields @@ -1293,18 +1293,6 @@ class EngineArgs: worker_extension_cls=self.worker_extension_cls, ) - if model_config.is_multimodal_model: - dp_supports_mm_processor_cache = (self.data_parallel_size == 1 - or data_parallel_external_lb) - if (not dp_supports_mm_processor_cache - and model_config.mm_processor_cache_gb > 0): - logger.warning( - "Multi-modal processor cache is disabled because " - "it is not compatible with data parallelism when " - "there does not exist a one-to-one correspondance " - "between API and engine core processes.") - model_config.set_mm_processor_cache_gb(0) - speculative_config = self.create_speculative_config( target_model_config=model_config, target_parallel_config=parallel_config, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index cbd714c159eb5..03c2f0375da42 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -36,6 +36,7 @@ from vllm.logits_process import get_bad_words_logits_processors from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal.cache import processor_only_cache_from_config from 
vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.outputs import (PoolingRequestOutput, RequestOutput, RequestOutputFactory) @@ -250,9 +251,13 @@ class LLMEngine: self.generation_config_fields = ( self.model_config.try_get_generation_config()) - self.input_preprocessor = InputPreprocessor(self.model_config, - self.tokenizer, - mm_registry) + self.input_preprocessor = InputPreprocessor( + self.model_config, + self.tokenizer, + mm_registry, + mm_processor_cache=processor_only_cache_from_config( + self.model_config, mm_registry), + ) self.model_executor = executor_class(vllm_config=vllm_config) @@ -840,8 +845,8 @@ class LLMEngine: def reset_mm_cache(self) -> bool: """Reset the multi-modal cache.""" - return self.input_preprocessor.mm_registry.reset_processor_cache( - self.model_config) + self.input_preprocessor.clear_cache() + return True def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: """Reset prefix cache for all devices.""" diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 3f521012e82a2..f0d0cab3df3d9 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -11,6 +11,7 @@ from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalInputs) from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -32,12 +33,14 @@ class InputPreprocessor: model_config: ModelConfig, tokenizer: Optional[TokenizerGroup], mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + mm_processor_cache: Optional[BaseMultiModalProcessorCache] = None, ) -> None: super().__init__() self.model_config = model_config self.tokenizer = tokenizer self.mm_registry = mm_registry + self.mm_processor_cache = mm_processor_cache def 
get_tokenizer_group(self) -> TokenizerGroup: if self.tokenizer is None: @@ -261,8 +264,11 @@ class InputPreprocessor: """ tokenizer = self._get_mm_tokenizer(lora_request) - mm_processor = self.mm_registry.create_processor(self.model_config, - tokenizer=tokenizer) + mm_processor = self.mm_registry.create_processor( + self.model_config, + tokenizer=tokenizer, + cache=self.mm_processor_cache, + ) if mm_processor_kwargs is None: mm_processor_kwargs = {} @@ -286,8 +292,12 @@ class InputPreprocessor: """ tokenizer = await self._get_mm_tokenizer_async(lora_request) - mm_processor = self.mm_registry.create_processor(self.model_config, - tokenizer=tokenizer) + mm_processor = self.mm_registry.create_processor( + self.model_config, + tokenizer=tokenizer, + cache=self.mm_processor_cache, + ) + if mm_processor_kwargs is None: mm_processor_kwargs = {} @@ -860,3 +870,7 @@ class InputPreprocessor: tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) + + def clear_cache(self) -> None: + if self.mm_processor_cache is not None: + self.mm_processor_cache.clear_cache() diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index ef146fdfbf97c..f0b392e9767ae 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -223,20 +223,26 @@ class InputRegistry: The model is identified by ``model_config``. 
""" # Avoid circular import + from vllm.multimodal.cache import processor_only_cache_from_config from vllm.sequence import SequenceData if not model_config.is_multimodal_model: seq_data = SequenceData.from_prompt_token_counts((0, seq_len)) return DummyData(seq_data=seq_data) + cache = processor_only_cache_from_config(model_config, mm_registry) + # Encoder dummy data does not contain multi-modal data if is_encoder_data: - enc_data = mm_registry.get_encoder_dummy_data( - model_config, seq_len) + enc_data = mm_registry.get_encoder_dummy_data(model_config, + seq_len, + cache=cache) seq_data = SequenceData.from_seqs(enc_data.prompt_token_ids) return DummyData(seq_data=seq_data) - dec_data = mm_registry.get_decoder_dummy_data(model_config, seq_len) + dec_data = mm_registry.get_decoder_dummy_data(model_config, + seq_len, + cache=cache) return DummyData( seq_data=SequenceData.from_seqs(dec_data.prompt_token_ids), diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index eeb8291c77847..53f0585541b1c 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -33,12 +33,13 @@ from vllm.inputs import InputProcessingContext from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, ProcessingCache, - PromptReplacement, PromptUpdate) + BaseProcessingInfo, PromptReplacement, + PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors @@ -367,7 +368,7 @@ def 
_build_hcxvision_hf_processor( info: HCXVisionProcessingInfo, dummy_inputs: BaseDummyInputsBuilder[HCXVisionProcessingInfo], *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor: if isinstance(info, HCXVisionProcessingInfo): return HCXVisionMultiModalProcessor( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index bc53982c938ce..0ee26b68345c3 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -22,14 +22,14 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargsItems) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, ProcessingCache, - PromptReplacement, PromptUpdate, - PromptUpdateDetails) + BaseProcessingInfo, PromptReplacement, + PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils.jsontree import json_map_leaves @@ -394,7 +394,7 @@ def _build_llava_or_pixtral_hf_processor( info: _I, dummy_inputs: BaseDummyInputsBuilder[_I], *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor: if isinstance(info, PixtralHFProcessingInfo): return PixtralHFMultiModalProcessor( diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index a2a71bdd12b36..c22d871ab20d9 100644 --- 
a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -58,7 +58,8 @@ from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem, VideoItem, VideoProcessorItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, - PromptUpdate, PromptUpdateDetails) + PromptUpdate, PromptUpdateDetails, + ResolvedPromptUpdate, _seq2text) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -744,6 +745,43 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): for modality, pattern in placeholders ] + def _recompute_cached_prompt_update( + self, + cached_update: ResolvedPromptUpdate, + new_item_idx: int, + ) -> ResolvedPromptUpdate: + new_update = super()._recompute_cached_prompt_update( + cached_update, + new_item_idx, + ) + + if cached_update.modality == "image": + tokenizer = self.info.get_tokenizer() + image_processor = self.info.get_image_processor() + version = self.info.get_model_version() + + text = _seq2text(tokenizer, cached_update.content.full) + prev_item_idx = cached_update.item_idx + + if version == (2, 0) or version == (2, 5): + im_start = image_processor.im_start_token + im_end = image_processor.im_end_token + else: + im_start = image_processor.im_id_start + im_end = image_processor.im_id_end + + new_update = new_update.with_content( + PromptUpdateDetails.select_text( + text.replace( + f"{im_start}{prev_item_idx}{im_end}", + f"{im_start}{new_item_idx}{im_end}", + 1, + ), + "<unk>", + )) + + return new_update + def _get_mm_fields_config( self, hf_inputs: BatchFeature, diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 438513433d3b2..08948960b275c 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -22,14 +22,14 @@ from vllm.model_executor.layers.quantization import 
QuantizationConfig from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, ProcessingCache, - PromptReplacement, PromptUpdate, - PromptUpdateDetails) + BaseProcessingInfo, PromptReplacement, + PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils.tensor_schema import TensorSchema, TensorShape @@ -322,7 +322,7 @@ def _build_mistral3_processor( info: _I, dummy_inputs: BaseDummyInputsBuilder[_I], *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor: assert isinstance(info, Mistral3ProcessingInfo) return Mistral3MultiModalProcessor( diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 61e09d56046cc..4522c7043d01a 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -41,7 +41,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, MultiModalPromptUpdates, PlaceholderFeaturesInfo, - PromptReplacement, PromptUpdate) + PromptReplacement, PromptUpdate, + ResolvedPromptUpdate) # yapf: enable from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors @@ -440,6 +441,23 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]): ) ] + def _recompute_cached_prompt_update( + self, + cached_update: ResolvedPromptUpdate, + new_item_idx: int, + ) -> 
ResolvedPromptUpdate: + new_update = super()._recompute_cached_prompt_update( + cached_update, + new_item_idx, + ) + + if cached_update.modality == "image": + hf_processor = self.info.get_hf_processor() + image_tokens: list[str] = hf_processor.img_tokens # type: ignore + new_update = new_update.with_target(image_tokens[new_item_idx]) + + return new_update + def _apply_prompt_updates( self, token_ids: list[int], diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 5129770e8d499..211cbd9c819cc 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -27,7 +27,7 @@ from vllm.multimodal.parse import (AudioProcessorItems, ImageEmbeddingItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, - PromptUpdate) + PromptUpdate, ResolvedPromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -850,6 +850,25 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): ), ] + def _recompute_cached_prompt_update( + self, + cached_update: ResolvedPromptUpdate, + new_item_idx: int, + ) -> ResolvedPromptUpdate: + new_update = super()._recompute_cached_prompt_update( + cached_update, + new_item_idx, + ) + + if cached_update.modality == "image": + image_tokens: list[str] = self.info.image_tokens # type: ignore + new_update = new_update.with_target(image_tokens[new_item_idx]) + elif cached_update.modality == "audio": + audio_tokens: list[str] = self.info.audio_tokens # type: ignore + new_update = new_update.with_target(audio_tokens[new_item_idx]) + + return new_update + @MULTIMODAL_REGISTRY.register_processor( Phi4MMMultiModalProcessor, diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index 9b9cca8c6bd3c..c66867315e553 100644 --- 
a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -25,12 +25,13 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.llava import LlavaDummyInputsBuilder from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, ProcessingCache, - PromptReplacement, PromptUpdate) + BaseProcessingInfo, PromptReplacement, + PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils.jsontree import json_map_leaves @@ -332,7 +333,7 @@ def _build_tarsier_hf_processor( info: _I_Tarsier, dummy_inputs: BaseDummyInputsBuilder[_I_Tarsier], *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor: if isinstance(info, TarsierProcessingInfo): return TarsierMultiModalProcessor( diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 5cec8e71fb265..0e81cb6d4d190 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import sys -from collections.abc import Mapping -from dataclasses import dataclass -from typing import TypeVar, Union +from abc import ABC, abstractmethod +from collections.abc import Mapping, Sequence +from typing import TYPE_CHECKING, Generic, Optional, TypeVar, Union import torch +from typing_extensions import TypeAlias, override from vllm.logger import init_logger from vllm.utils import GiB_bytes, 
LRUCache @@ -15,24 +16,67 @@ from .inputs import (MultiModalFieldElem, MultiModalKwargs, MultiModalKwargsItem, MultiModalKwargsItems, NestedTensors) +if TYPE_CHECKING: + from vllm.config import ModelConfig, VllmConfig + + from .processing import ResolvedPromptUpdate + from .registry import MultiModalRegistry + logger = init_logger(__name__) -@dataclass -class MultiModalCacheItemMetadata: - size: int +class MultiModalProcessorCacheItem: + """ + The data to store inside `MultiModalProcessorOnlyCache`. - @classmethod - def wraps(cls, value: "MultiModalCacheValue"): - return cls(size=MultiModalCache.get_item_size(value)) + Args: + item: The processed tensor data corresponding to a multi-modal item. + prompt_updates: The prompt updates corresponding to `item`. + """ + + def __init__( + self, + item: MultiModalKwargsItem, + prompt_updates: Sequence["ResolvedPromptUpdate"], + ) -> None: + super().__init__() + + self.item = item + self.prompt_updates = prompt_updates + + +class MultiModalProcessorCacheItemMetadata: + """ + The metadata to store inside `MultiModalProcessorSenderCache`. + + Args: + item: The processed tensor data corresponding to a multi-modal item. + Since P1 already stores the tensor data, we only store its size + metadata in P0 to reduce memory usage. The size metadata is still + needed to keep the same cache eviction policy as P1. + prompt_updates: The prompt updates corresponding to `item`. + This needs to stay on P0 because for some models, they are + dependent on the processed tensor data (cached on P1). 
+ """ + + def __init__( + self, + item: MultiModalKwargsItem, + prompt_updates: Sequence["ResolvedPromptUpdate"], + ) -> None: + super().__init__() + + self.item_size = MultiModalCache.get_item_size(item) + self.prompt_updates = prompt_updates MultiModalCacheValue = Union[ + MultiModalProcessorCacheItem, + MultiModalProcessorCacheItemMetadata, MultiModalKwargsItems, MultiModalKwargsItem, MultiModalKwargs, Mapping[str, NestedTensors], - MultiModalCacheItemMetadata, ] _V = TypeVar("_V", bound=MultiModalCacheValue) @@ -47,8 +91,10 @@ class MultiModalCache: *, debug: bool = False, ) -> int: - if isinstance(leaf, MultiModalFieldElem): - return cls.get_item_size(leaf.data) # type: ignore + if isinstance(leaf, MultiModalProcessorCacheItem): + return cls.get_leaf_size(leaf.item) + if isinstance(leaf, MultiModalProcessorCacheItemMetadata): + return leaf.item_size # These are not subclasses of dict if isinstance(leaf, MultiModalKwargsItems): @@ -58,13 +104,13 @@ class MultiModalCache: if isinstance(leaf, MultiModalKwargs): return cls.get_item_size(leaf.data) # type: ignore + if isinstance(leaf, MultiModalFieldElem): + return cls.get_item_size(leaf.data) # type: ignore + # sys.getsizeof doesn't work for tensors if isinstance(leaf, torch.Tensor): return leaf.nbytes - if isinstance(leaf, MultiModalCacheItemMetadata): - return leaf.size - return sys.getsizeof(leaf) @classmethod @@ -98,3 +144,332 @@ class MultiModalCache: GiB_bytes * capacity_gb, getsizeof=lambda x: cls.get_item_size(x, debug=debug), ) + + +_I = TypeVar("_I", contravariant=True) +_O = TypeVar("_O", covariant=True) + + +class BaseMultiModalCache(ABC, Generic[_I, _O]): + """ + Abstract base class to read/write multi-modal items from cache. + + The idea of multi-modal caching is based on having a client and server + where the client executes in the frontend process (=P0) and + the server in the core process (=P1). 
The data flow is as follows: + + ``` + is_cached() x N get_and_update() + P0: From API -----------------> -----------------> To P1 + + get_and_update() + P1: From P0 -----------------> To model + ``` + + `is_cached()` can be called any number of times in P0. However, + `get_and_update()` must be called in P0 and P1 one after another + so that their cache eviction order remains the same. + + This ensures that the keys in P0 and P1 caches are mirrored, + allowing us to determine whether a key is cached in P1 by looking + up the P0 cache, without having to communicate with P1. + """ + + @abstractmethod + def get_and_update_item( + self, + mm_item: _I, + mm_hash: str, + ) -> _O: + """ + Possibly update a multi-modal item based on whether it is + in the underlying cache. + + This update is done out-of-place and updates the cache eviction order. + + Args: + mm_item: The multi-modal item to update. + mm_hash: The hash of `mm_item`. + + Returns: + The updated multi-modal item. + """ + raise NotImplementedError + + def get_and_update( + self, + mm_items: Sequence[_I], + mm_hashes: list[str], + ) -> list[_O]: + """ + Possibly update a sequence of multi-modal items based on whether they + are in the underlying cache. + + This update is done out-of-place and updates the cache eviction order. + + Args: + mm_items: The multi-modal items to update. + mm_hashes: The hash of each item in `mm_items`. + + Returns: + A new list of updated multi-modal items. 
+ """ + assert len(mm_items) == len(mm_hashes) + + return [ + self.get_and_update_item(mm_item, mm_hash) + for mm_item, mm_hash in zip(mm_items, mm_hashes) + ] + + @abstractmethod + def clear_cache(self) -> None: + """Clear the underlying cache.""" + raise NotImplementedError + + +MultiModalProcessorCacheInItem: TypeAlias = \ + Optional[tuple[MultiModalKwargsItem, Sequence["ResolvedPromptUpdate"]]] + + +MultiModalProcessorCacheOutItem: TypeAlias = \ + tuple[Optional[MultiModalKwargsItem], Sequence["ResolvedPromptUpdate"]] + + +class BaseMultiModalProcessorCache( + BaseMultiModalCache[MultiModalProcessorCacheInItem, + MultiModalProcessorCacheOutItem]): + """The required interface for caches on P0.""" + + @abstractmethod + def is_cached_item(self, mm_hash: str) -> bool: + """ + Check whether a multi-modal item is + in the underlying cache. + + This **DOES NOT** update the cache eviction order. + + Args: + mm_hash: The hash of the item to check. + + Returns: + `True` if the item is cached, otherwise `False`. + """ + raise NotImplementedError + + def is_cached(self, mm_hashes: list[str]) -> list[bool]: + """ + Check whether a sequence of multi-modal items are + in the underlying cache. + + This **DOES NOT** update the cache eviction order. + + Args: + mm_hashes: The hash of each item to check. + + Returns: + For each item, `True` if the item is cached, otherwise `False`. + """ + return [self.is_cached_item(mm_hash) for mm_hash in mm_hashes] + + +class MultiModalProcessorOnlyCache(BaseMultiModalProcessorCache): + """ + The cache which is used on P0 when IPC caching is disabled. + + How to update each item: + + - If the item is in the cache, replace the input with the cached item. + - If the item is not in the cache, store that item (which includes + tensor data and metadata) into the cache, and return the input. 
+ """ + + def __init__(self, model_config: "ModelConfig") -> None: + super().__init__() + + mm_config = model_config.get_multimodal_config() + + self._cache = MultiModalCache.get_lru_cache( + mm_config.mm_processor_cache_gb, + MultiModalProcessorCacheItem, + ) + + @override + def is_cached_item(self, mm_hash: str) -> bool: + return mm_hash in self._cache + + @override + def get_and_update_item( + self, + mm_item: MultiModalProcessorCacheInItem, + mm_hash: str, + ) -> MultiModalProcessorCacheOutItem: + if (cached_item := self._cache.get(mm_hash)) is not None: + return cached_item.item, cached_item.prompt_updates + + assert mm_item is not None, f"Expected a cached item for {mm_hash=}" + + self._cache[mm_hash] = MultiModalProcessorCacheItem(*mm_item) + + return mm_item + + @override + def clear_cache(self) -> None: + self._cache.clear() + + +class MultiModalProcessorSenderCache(BaseMultiModalProcessorCache): + """ + The cache which is used on P0 when IPC caching is enabled. + + How to update each item: + + - If the item is already in the cache, clear the input to avoid + unnecessary IPC. + + - If the item is not in the cache, store the metadata of that item so + that the eviction policy remains the same as the cache on P1, + and return the input. + By only storing the metadata, we avoid keeping the data itself in + memory inside P0. 
+ """ + + def __init__(self, model_config: "ModelConfig") -> None: + super().__init__() + + mm_config = model_config.get_multimodal_config() + + self._cache = MultiModalCache.get_lru_cache( + mm_config.mm_processor_cache_gb, + MultiModalProcessorCacheItemMetadata, + ) + + @override + def is_cached_item(self, mm_hash: str) -> bool: + return mm_hash in self._cache + + @override + def get_and_update_item( + self, + mm_item: MultiModalProcessorCacheInItem, + mm_hash: str, + ) -> MultiModalProcessorCacheOutItem: + if (cached_item := self._cache.get(mm_hash)) is not None: + return None, cached_item.prompt_updates + + assert mm_item is not None, f"Expected a cached item for {mm_hash=}" + + self._cache[mm_hash] = MultiModalProcessorCacheItemMetadata(*mm_item) + + return mm_item + + @override + def clear_cache(self) -> None: + self._cache.clear() + + +def _enable_processor_cache( + model_config: "ModelConfig", + mm_registry: "MultiModalRegistry", +) -> bool: + if not mm_registry.supports_multimodal_inputs(model_config): + return False + + mm_config = model_config.get_multimodal_config() + return mm_config.mm_processor_cache_gb > 0 + + +def _enable_ipc_cache(vllm_config: "VllmConfig") -> bool: + parallel_config = vllm_config.parallel_config + supports_ipc_cache = (parallel_config.data_parallel_size == 1 + or parallel_config.data_parallel_external_lb) + + return supports_ipc_cache + + +def processor_cache_from_config( + vllm_config: "VllmConfig", + mm_registry: "MultiModalRegistry", +) -> Optional[BaseMultiModalProcessorCache]: + """Return a `BaseMultiModalProcessorCache`, if enabled.""" + model_config = vllm_config.model_config + + if not _enable_processor_cache(model_config, mm_registry): + return None + + if not _enable_ipc_cache(vllm_config): + return MultiModalProcessorOnlyCache(model_config) + + return MultiModalProcessorSenderCache(model_config) + + +def processor_only_cache_from_config( + model_config: "ModelConfig", + mm_registry: "MultiModalRegistry", +): + 
"""Return a `MultiModalProcessorOnlyCache`, if enabled.""" + if not _enable_processor_cache(model_config, mm_registry): + return None + + return MultiModalProcessorOnlyCache(model_config) + + +class BaseMultiModalReceiverCache( + BaseMultiModalCache[Optional[MultiModalKwargsItem], + MultiModalKwargsItem]): + """The required interface for caches on P1.""" + + +class MultiModalReceiverCache(BaseMultiModalReceiverCache): + """ + The cache which is used on P1 when IPC caching is enabled. + + How to update each item: + + - If the item is in the cache, replace the input with the cached item. + - If the item is not in the cache, store that item (which includes tensor + data) into the cache, and return the input. + """ + + def __init__(self, model_config: "ModelConfig") -> None: + super().__init__() + + mm_config = model_config.get_multimodal_config() + + self._cache = MultiModalCache.get_lru_cache( + mm_config.mm_processor_cache_gb, + MultiModalKwargsItem, + ) + + @override + def get_and_update_item( + self, + mm_item: Optional[MultiModalKwargsItem], + mm_hash: str, + ) -> MultiModalKwargsItem: + if (cached_item := self._cache.get(mm_hash)) is not None: + return cached_item + + assert mm_item is not None, f"Expected a cached item for {mm_hash=}" + + self._cache[mm_hash] = mm_item + return mm_item + + @override + def clear_cache(self) -> None: + self._cache.clear() + + +def receiver_cache_from_config( + vllm_config: "VllmConfig", + mm_registry: "MultiModalRegistry", +) -> Optional[BaseMultiModalReceiverCache]: + """Return a `BaseMultiModalReceiverCache`, if enabled.""" + model_config = vllm_config.model_config + + if not _enable_processor_cache(model_config, mm_registry): + return None + + if not _enable_ipc_cache(vllm_config): + return None + + return MultiModalReceiverCache(model_config) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 581f9a109cce6..2c0ebaced67ef 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -7,11 
+7,11 @@ from collections.abc import Mapping, Sequence from dataclasses import dataclass from functools import partial from itertools import accumulate -from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar, - Union, cast, final) +from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, Union, + cast, final) import numpy as np -from typing_extensions import NotRequired, TypeAlias, deprecated +from typing_extensions import NotRequired, TypeAlias, TypeVar, deprecated from vllm.utils import LazyLoader, full_groupby, is_list_of from vllm.utils.jsontree import JSONTree, json_map_leaves @@ -668,7 +668,15 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): return {key: elem.data for key, elem in self.items()} -class MultiModalKwargsItems(UserDict[str, Sequence[MultiModalKwargsItem]]): +_I = TypeVar( + "_I", + MultiModalKwargsItem, + Optional[MultiModalKwargsItem], + default=MultiModalKwargsItem, +) + + +class MultiModalKwargsItems(UserDict[str, Sequence[_I]]): """ A dictionary of [`MultiModalKwargsItem`][vllm.multimodal.inputs.MultiModalKwargsItem]s @@ -714,27 +722,37 @@ class MultiModalKwargsItems(UserDict[str, Sequence[MultiModalKwargsItem]]): items_by_modality = full_groupby(items, key=lambda x: x.modality) return MultiModalKwargsItems(items_by_modality) - def __getitem__(self, modality: str): + def __getitem__(self, modality: str) -> Sequence[_I]: if modality not in self: raise KeyError(f"Modality {modality!r} not found. 
" f"Available modalities: {set(self.keys())}") - return super().__getitem__(modality) + return super().__getitem__(modality) # type: ignore[return-value] def get_data(self, *, pin_memory: bool = False) -> "MultiModalKwargs": elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) - for items in self.values(): - for item in items: + for modality, items in self.items(): + for i, item in enumerate(items): + if item is None: + raise RuntimeError("Cannot build data from empty " + f"mm_items[{modality}][{i}]") + for key, elem in item.items(): elems_by_key[key].append(elem) return MultiModalKwargs({ key: elems[0].field.reduce_data(elems, pin_memory=pin_memory) - for key, elems in elems_by_key.items() if len(elems) > 0 + for key, elems in elems_by_key.items() }) +MultiModalKwargsOptionalItems: TypeAlias = Union[ + MultiModalKwargsItems[MultiModalKwargsItem], + MultiModalKwargsItems[Optional[MultiModalKwargsItem]], +] + + class MultiModalKwargs(UserDict[str, NestedTensors]): """ A dictionary that represents the keyword arguments to @@ -898,7 +916,7 @@ class MultiModalInputs(TypedDict): token_type_ids: NotRequired[list[int]] """The token type IDs of the prompt.""" - mm_kwargs: MultiModalKwargsItems + mm_kwargs: MultiModalKwargsOptionalItems """Keyword arguments to be directly passed to the model after batching.""" mm_hashes: "MultiModalHashDict" diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 8c225e2a3c086..6ecdf80d4aa6f 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from collections import defaultdict from collections.abc import (Callable, Generator, ItemsView, Iterable, Mapping, Sequence) -from dataclasses import dataclass, field +from dataclasses import dataclass, field, replace from enum import Enum from functools import lru_cache from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol, @@ -20,12 +20,11 @@ from 
vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens, encode_tokens) from vllm.utils import flatten_2d_lists, full_groupby -from .cache import MultiModalCache from .hasher import MultiModalHasher from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargsItem, MultiModalKwargsItems, - PlaceholderRange) + MultiModalKwargsOptionalItems, PlaceholderRange) from .parse import (DictEmbeddingItems, EmbeddingItems, MultiModalDataItems, MultiModalDataParser) @@ -34,6 +33,7 @@ if TYPE_CHECKING: from transformers.feature_extraction_utils import BatchFeature from transformers.processing_utils import ProcessorMixin + from .cache import BaseMultiModalProcessorCache from .profiling import BaseDummyInputsBuilder logger = init_logger(__name__) @@ -557,6 +557,15 @@ class ResolvedPromptUpdate: return self.iter_token_matches(prompt, tokenizer, start_idx=start_idx) + def with_target(self, target: UpdateTarget): + return replace(self, target=target) + + def with_content(self, content: PromptUpdateInfo): + if not isinstance(content, PromptUpdateDetails): + content = PromptUpdateDetails.from_seq(content) + + return replace(self, content=content) + class _TokenMatch(NamedTuple): start_idx: int @@ -865,21 +874,6 @@ def find_mm_placeholders( return dict(full_groupby_modality(it)) -class ProcessingCache(MultiModalCache): - - def __init__(self, capacity_gb: float) -> None: - super().__init__() - - self._cache = self.get_lru_cache(capacity_gb, MultiModalKwargsItem) - - self.get = self._cache.get - self.put = self._cache.put - self.reset = self._cache.clear - - -_CacheItemOrHash = Union[MultiModalKwargsItem, str] - - class BaseProcessingInfo: """Base class to provide the information necessary for data processing.""" @@ -982,7 +976,7 @@ For an item `MultiModalPromptUpdates[k][i]`, class MultiModalProcessingInfo(NamedTuple): - kwargs: MultiModalKwargsItems + kwargs: MultiModalKwargsOptionalItems hashes: 
MultiModalHashes prompt_updates: MultiModalPromptUpdates @@ -994,11 +988,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): Not to be confused with `transformers.ProcessorMixin`. """ - def __init__(self, - info: _I, - dummy_inputs: "BaseDummyInputsBuilder[_I]", - *, - cache: Optional[ProcessingCache] = None) -> None: + def __init__( + self, + info: _I, + dummy_inputs: "BaseDummyInputsBuilder[_I]", + *, + cache: Optional["BaseMultiModalProcessorCache"] = None, + ) -> None: super().__init__() self.info = info @@ -1355,32 +1351,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): return prompt_ids, mm_processed_data, False - def _get_cache_missing_items( - self, - cache: ProcessingCache, - mm_data_items: MultiModalDataItems, - mm_hashes: MultiModalHashes, - ) -> tuple[dict[str, list[_CacheItemOrHash]], MultiModalDataItems]: - mm_cache_items_or_hashes: dict[str, list[_CacheItemOrHash]] = { - modality: [(h if (v := cache.get(h)) is None else v) - for h in hashes] - for modality, hashes in mm_hashes.items() - } - - mm_missing_idxs = { - modality: [ - idx for idx, item_or_hash in enumerate(items_or_hashes) - if isinstance(item_or_hash, str) - ] - for modality, items_or_hashes in mm_cache_items_or_hashes.items() - } - mm_missing_data = { - modality: [mm_data_items[modality][idx] for idx in idxs] - for modality, idxs in mm_missing_idxs.items() - } - - return mm_cache_items_or_hashes, self._to_mm_items(mm_missing_data) - def _hash_mm_items( self, mm_items: MultiModalDataItems, @@ -1401,28 +1371,92 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): for modality, items in mm_items.items() } + def _get_cache_missing_items( + self, + cache: "BaseMultiModalProcessorCache", + mm_data_items: MultiModalDataItems, + mm_hashes: MultiModalHashes, + ) -> MultiModalDataItems: + mm_is_cached = { + modality: cache.is_cached(hashes) + for modality, hashes in mm_hashes.items() + } + + mm_missing_idxs = { + modality: [ + idx for idx, item_is_cached in enumerate(items_is_cached) + if not 
item_is_cached + ] + for modality, items_is_cached in mm_is_cached.items() + } + mm_missing_data = { + modality: [mm_data_items[modality][idx] for idx in idxs] + for modality, idxs in mm_missing_idxs.items() + } + + return self._to_mm_items(mm_missing_data) + + def _recompute_cached_prompt_update( + self, + cached_update: ResolvedPromptUpdate, + new_item_idx: int, + ) -> ResolvedPromptUpdate: + """ + Override this if other attributes of `ResolvedPromptUpdate` + also need to be recomputed after retrieving from the cache. + """ + return replace(cached_update, item_idx=new_item_idx) + def _merge_mm_kwargs( self, - cache: ProcessingCache, - mm_cache_items_or_hashes: dict[str, list[_CacheItemOrHash]], + cache: "BaseMultiModalProcessorCache", + mm_hashes: MultiModalHashes, mm_missing_kwargs: MultiModalKwargsItems, - ) -> MultiModalKwargsItems: + mm_missing_prompt_updates: MultiModalPromptUpdates, + ) -> tuple[MultiModalKwargsOptionalItems, MultiModalPromptUpdates]: + # Need to calculate this at the beginning to avoid skipping cache logic + # for subsequently repeated items in the same modality + mm_is_cached = { + modality: cache.is_cached(hashes) + for modality, hashes in mm_hashes.items() + } + mm_missing_next_idx = defaultdict[str, int](lambda: 0) - merged_items = defaultdict[str, list[MultiModalKwargsItem]](list) - for modality, items_or_hashes in mm_cache_items_or_hashes.items(): - for item_or_hash in items_or_hashes: - if isinstance(item_or_hash, str): - kw_item = mm_missing_kwargs[modality][ - mm_missing_next_idx[modality]] - cache.put(item_or_hash, kw_item) + merged_kwargs = defaultdict[str, + list[Optional[MultiModalKwargsItem]]](list) + merged_prompt_updates = defaultdict[ + str, list[Sequence[ResolvedPromptUpdate]]](list) + for modality, hashes in mm_hashes.items(): + missing_kwargs = mm_missing_kwargs.get(modality, []) + missing_prompt_updates = mm_missing_prompt_updates.get( + modality, []) + + for item_idx, item_hash in enumerate(hashes): + kwargs: 
Optional[MultiModalKwargsItem] + if not mm_is_cached[modality][item_idx]: + missing_next_idx = mm_missing_next_idx[modality] + kwargs = missing_kwargs[missing_next_idx] + updates = missing_prompt_updates[missing_next_idx] + mm_missing_next_idx[modality] += 1 + + item = kwargs, updates else: - kw_item = item_or_hash + item = None - merged_items[modality].append(kw_item) + kwargs, updates = cache.get_and_update_item(item, item_hash) - return MultiModalKwargsItems(merged_items) + merged_kwargs[modality].append(kwargs) + merged_prompt_updates[modality].append([ + self._recompute_cached_prompt_update(update, item_idx) + for update in updates + ]) + + mm_kwargs = MultiModalKwargsItems(merged_kwargs) + mm_prompt_updates = dict(merged_prompt_updates) + + return mm_kwargs, mm_prompt_updates def _apply_hf_processor( self, @@ -1490,10 +1524,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, tokenization_kwargs) - ( - mm_cache_items_or_hashes, - mm_missing_data_items, - ) = self._get_cache_missing_items( + + mm_missing_data_items = self._get_cache_missing_items( cache=cache, mm_data_items=mm_data_items, mm_hashes=mm_hashes, @@ -1520,16 +1552,17 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs), ) - mm_kwargs = self._merge_mm_kwargs( - cache, - mm_cache_items_or_hashes=mm_cache_items_or_hashes, - mm_missing_kwargs=mm_missing_kwargs, + mm_missing_prompt_updates = self._get_mm_prompt_updates( + mm_missing_data_items, + hf_processor_mm_kwargs, + mm_missing_kwargs, ) - mm_prompt_updates = self._get_mm_prompt_updates( - mm_data_items, - hf_processor_mm_kwargs, - mm_kwargs, + mm_kwargs, mm_prompt_updates = self._merge_mm_kwargs( + cache, + mm_hashes=mm_hashes, + mm_missing_kwargs=mm_missing_kwargs, + mm_missing_prompt_updates=mm_missing_prompt_updates, ) mm_info = MultiModalProcessingInfo( @@ -1614,7 +1647,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): def _validate_mm_kwargs( 
self, - mm_kwargs: MultiModalKwargsItems, + mm_kwargs: MultiModalKwargsOptionalItems, mm_item_counts: Mapping[str, int], ) -> None: for modality, item_count in mm_item_counts.items(): @@ -1655,7 +1688,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): self, mm_items: MultiModalDataItems, prompt_ids: list[int], - mm_kwargs: MultiModalKwargsItems, + mm_kwargs: MultiModalKwargsOptionalItems, mm_prompt_updates: MultiModalPromptUpdates, is_update_applied: bool, ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index ea2efbdd8b524..ffc69a2db60a4 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -13,7 +13,7 @@ import vllm.envs as envs from vllm.logger import init_logger from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, - MultiModalInputs, MultiModalKwargsItems, + MultiModalInputs, MultiModalKwargsOptionalItems, MultiModalPlaceholderDict) from .processing import (BaseMultiModalProcessor, BaseProcessingInfo, EncDecMultiModalProcessor) @@ -43,7 +43,7 @@ class DummyDecoderData(NamedTuple): """Dummy data used for profiling.""" prompt_token_ids: list[int] - multi_modal_data: MultiModalKwargsItems + multi_modal_data: MultiModalKwargsOptionalItems multi_modal_placeholders: MultiModalPlaceholderDict diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 8cd9e5604872a..38adbf8f3536a 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping from dataclasses import dataclass -from functools import lru_cache from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar import torch.nn as nn @@ -13,8 +12,9 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, cached_tokenizer_from_config) from vllm.utils import ClassRegistry -from .processing import 
(BaseMultiModalProcessor, BaseProcessingInfo, - ProcessingCache) +from .cache import (BaseMultiModalProcessorCache, + processor_only_cache_from_config) +from .processing import BaseMultiModalProcessor, BaseProcessingInfo from .profiling import (BaseDummyInputsBuilder, DummyDecoderData, DummyEncoderData, MultiModalProfiler) @@ -65,7 +65,7 @@ class MultiModalProcessorFactory(Protocol[_I]): info: _I, dummy_inputs: BaseDummyInputsBuilder[_I], *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor[_I]: ... @@ -80,20 +80,13 @@ class _ProcessorFactories(Generic[_I]): self, ctx: InputProcessingContext, *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ): info = self.info(ctx) dummy_inputs_builder = self.dummy_inputs(info) return self.processor(info, dummy_inputs_builder, cache=cache) -# Make sure a different cache is used for each model config -# NOTE: ModelConfig is not hashable so it cannot be passed directly -@lru_cache(maxsize=1) -def _get_processor_cache(model_id: str, capacity_gb: int): - return ProcessingCache(capacity_gb) if capacity_gb > 0 else None - - class MultiModalRegistry: """ A registry that dispatches data processing according to the model. @@ -103,31 +96,6 @@ class MultiModalRegistry: self._processor_factories = ClassRegistry[nn.Module, _ProcessorFactories]() - def _get_processor_cache(self, model_config: "ModelConfig"): - model_id = model_config.model - capacity_gb = model_config.mm_processor_cache_gb - return _get_processor_cache(model_id, capacity_gb) - - def reset_processor_cache(self, model_config: "ModelConfig") -> bool: - """Reset the multi-modal processing cache.""" - if processor_cache := self._get_processor_cache(model_config): - processor_cache.reset() - - return True # Success - - def enable_mm_input_cache(self, model_config: "ModelConfig") -> bool: - """Whether the multi-modal input cache should be enabled. 
- NOTE: This is put under MultiModalRegistry on purpose to respect - text-only mode for multimodal models. - """ - - if not self.supports_multimodal_inputs(model_config): - return False - - mm_config = model_config.get_multimodal_config() - - return mm_config.mm_processor_cache_gb > 0 - def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool: """ Checks if the model supports multimodal inputs. @@ -157,6 +125,8 @@ class MultiModalRegistry: def get_max_tokens_per_item_by_modality( self, model_config: "ModelConfig", + *, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> Mapping[str, int]: """ Get the maximum number of tokens per data item from each modality based @@ -165,11 +135,11 @@ class MultiModalRegistry: if not model_config.is_multimodal_model: return {} - processor = self.create_processor(model_config, disable_cache=False) + processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) seq_len = model_config.max_model_len - mm_limits = self.get_mm_limits_per_prompt(model_config) + mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache) return profiler.get_mm_max_contiguous_tokens( seq_len, @@ -182,6 +152,8 @@ class MultiModalRegistry: def get_max_tokens_per_item_by_nonzero_modality( self, model_config: "ModelConfig", + *, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> Mapping[str, int]: """ Get the maximum number of tokens per data item from each modality based @@ -192,15 +164,19 @@ class MultiModalRegistry: This is currently directly used only in V1 for profiling the memory usage of a model. 
""" - mm_limits = self.get_mm_limits_per_prompt(model_config) + mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache) + max_tokens_per_item = self.get_max_tokens_per_item_by_modality( + model_config, + cache=cache, + ) return { key: max_tokens_per_mm_item - for key, max_tokens_per_mm_item in - self.get_max_tokens_per_item_by_modality(model_config).items() + for key, max_tokens_per_mm_item in max_tokens_per_item.items() if mm_limits[key] > 0 } + # TODO: Remove once V0 is gone def get_max_tokens_by_modality( self, model_config: "ModelConfig", @@ -209,14 +185,19 @@ class MultiModalRegistry: Get the maximum number of tokens from each modality for profiling the memory usage of a model. """ - mm_limits = self.get_mm_limits_per_prompt(model_config) + cache = processor_only_cache_from_config(model_config, self) + mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache) + max_tokens_per_item = self.get_max_tokens_per_item_by_modality( + model_config, + cache=cache, + ) return { key: mm_limits[key] * max_tokens_per_mm_item - for key, max_tokens_per_mm_item in - self.get_max_tokens_per_item_by_modality(model_config).items() + for key, max_tokens_per_mm_item in max_tokens_per_item.items() } + # TODO: Remove once V0 is gone def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: """ Get the maximum number of multi-modal tokens @@ -227,6 +208,8 @@ class MultiModalRegistry: def get_mm_limits_per_prompt( self, model_config: "ModelConfig", + *, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> Mapping[str, int]: """ Get the maximum number of multi-modal input instances for each modality @@ -235,7 +218,7 @@ class MultiModalRegistry: if not model_config.is_multimodal_model: return {} - processor = self.create_processor(model_config, disable_cache=False) + processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) return profiler.get_mm_limits() @@ -303,7 +286,7 @@ class 
MultiModalRegistry: model_config: "ModelConfig", *, tokenizer: Optional[AnyTokenizer] = None, - disable_cache: Optional[bool] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor[BaseProcessingInfo]: """ Create a multi-modal processor for a specific model and tokenizer. @@ -311,15 +294,10 @@ class MultiModalRegistry: if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") - if disable_cache is None: - disable_cache = not model_config.enable_mm_processor_cache - model_cls = self._get_model_cls(model_config) factories = self._processor_factories[model_cls] ctx = self._create_processing_ctx(model_config, tokenizer) - cache = None if disable_cache else self._get_processor_cache( - model_config) return factories.build_processor(ctx, cache=cache) @@ -328,13 +306,15 @@ class MultiModalRegistry: model_config: "ModelConfig", seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + *, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> DummyDecoderData: """ Create dummy data for profiling the memory usage of a model. The model is identified by ``model_config``. """ - processor = self.create_processor(model_config, disable_cache=False) + processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts) @@ -352,13 +332,15 @@ class MultiModalRegistry: model_config: "ModelConfig", seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + *, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> DummyEncoderData: """ Create dummy data for profiling the memory usage of a model. The model is identified by ``model_config``. 
""" - processor = self.create_processor(model_config, disable_cache=False) + processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 342d7b24f8e98..dbea0b610b31a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -597,8 +597,7 @@ class AsyncLLM(EngineClient): await asyncio.gather(*coros) async def reset_mm_cache(self) -> None: - self.processor.mm_registry.reset_processor_cache(self.model_config) - self.processor.mm_input_cache_client.reset() + self.processor.clear_cache() await self.engine_core.reset_mm_cache_async() async def reset_prefix_cache(self, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 32765cda6482f..b614828061846 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -22,6 +22,7 @@ from vllm.logger import init_logger from vllm.logging_utils.dump_input import dump_engine_exception from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import receiver_cache_from_config from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) @@ -38,7 +39,6 @@ from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType, ReconfigureDistributedRequest, ReconfigureRankType, UtilityOutput, UtilityResult) -from vllm.v1.engine.mm_input_cache import MultiModalInputCacheServer from vllm.v1.engine.utils import EngineHandshakeMetadata, EngineZmqAddresses from vllm.v1.executor.abstract import Executor from vllm.v1.kv_cache_interface import KVCacheConfig @@ -128,8 +128,9 @@ class EngineCore: ) self.use_spec_decode = vllm_config.speculative_config is not None - self.mm_input_cache_server = MultiModalInputCacheServer( - vllm_config.model_config, MULTIMODAL_REGISTRY) + 
self.mm_registry = mm_registry = MULTIMODAL_REGISTRY + self.mm_receiver_cache = receiver_cache_from_config( + vllm_config, mm_registry) # Setup batch queue for pipeline parallelism. # Batch queue for scheduled batches. This enables us to asynchronously @@ -370,7 +371,8 @@ class EngineCore: logger.warning("Resetting the multi-modal cache when requests are " "in progress may lead to desynced internal caches.") - self.mm_input_cache_server.reset() + if self.mm_receiver_cache is not None: + self.mm_receiver_cache.clear_cache() def reset_prefix_cache(self): self.scheduler.reset_prefix_cache() @@ -435,10 +437,11 @@ class EngineCore: assert request.mm_kwargs is not None # Note on thread safety: no race condition. - # `mm_input_cache_server` is reset at the end of LLMEngine init, + # `mm_receiver_cache` is reset at the end of LLMEngine init, # and will only accessed in the input processing thread afterwards. - request.mm_kwargs = self.mm_input_cache_server.get_and_update( - request.mm_kwargs, request.mm_hashes) + if self.mm_receiver_cache is not None: + request.mm_kwargs = self.mm_receiver_cache.get_and_update( + request.mm_kwargs, request.mm_hashes) req = Request.from_engine_core_request(request, self.request_block_hasher) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 5a00a930951cc..7130f666ef19f 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -271,8 +271,7 @@ class LLMEngine: self.engine_core.profile(False) def reset_mm_cache(self): - self.processor.mm_registry.reset_processor_cache(self.model_config) - self.processor.mm_input_cache_client.reset() + self.processor.clear_cache() self.engine_core.reset_mm_cache() def reset_prefix_cache(self, device: Optional[Device] = None): diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py deleted file mode 100644 index aa7dc62fd4acb..0000000000000 --- a/vllm/v1/engine/mm_input_cache.py +++ /dev/null @@ -1,121 +0,0 @@ -# 
SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Sequence -from typing import TYPE_CHECKING, Optional - -from vllm.multimodal import MultiModalRegistry -from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata -from vllm.multimodal.inputs import MultiModalKwargsItem -from vllm.utils import is_list_of - -if TYPE_CHECKING: - from vllm.config import ModelConfig - -# The idea of multimodal input caching is based on having a client and -# a server, where the client executes in the frontend process (=P0) and the -# server in the core process (=P1). -# -# -- P0: -# - BaseMultiModalProcessor calls MultiModalHasher to get the `mm_hash` of -# each input multi-modal item (e.g. image), -# - BaseMultiModalProcessor processes the input items into `mm_kwargs`, -# which are MultiModalKwargsItem instances that each correspond to an -# input multi-modal item. -# - MultiModalInputCacheClient accepts the `mm_kwargs` and corresponding -# `mm_hash` for each item. It stores the `mm_hash` as keys and the size -# of `mm_kwargs`, but not the `mm_kwargs` themselves, to avoid taking -# up additional memory in P0. -# - The `mm_hash` is always sent to P1. -# - The corresponding `mm_kwargs` are only sent to P1 if they are not cached -# in MultiModalInputCacheServer. -# -# -- P1: -# - If the `mm_hash` is cached (i.e. `mm_kwargs` are not sent from P0), -# MultiModalInputCacheServer retrieves the corresponding `mm_kwargs`. -# - If the `mm_hash` is not cached (i.e. `mm_kwargs` are sent from P0), -# MultiModalInputCacheServer stores `mm_kwargs` under the key `mm_hash`. -# - Either way, the `mm_hash` and corresponding `mm_kwargs` are sent to -# the engine for model execution. -# -# Both Client and Server must perform cache update and eviction based on the -# same item size. 
This ensures that the keys of MultiModalInputCacheClient -# and MultiModalInputCacheServer are mirrored, allowing us to determine in P0 -# whether a key is cached in MultiModalInputCacheServer by querying -# MultiModalInputCacheClient without having to communicate with P1. - - -class MultiModalInputCacheClient: - """Used by P0 to check whether multi-modal kwargs are cached in P1.""" - - def __init__(self, model_config: "ModelConfig", - mm_registry: MultiModalRegistry) -> None: - super().__init__() - - self.enabled = mm_registry.enable_mm_input_cache(model_config) - self.mm_cache = MultiModalCache.get_lru_cache( - model_config.get_mm_input_cache_gb(), - MultiModalCacheItemMetadata, - ) - - def get_and_update( - self, - mm_kwargs: Sequence[MultiModalKwargsItem], - mm_hashes: list[str], - ) -> list[Optional[MultiModalKwargsItem]]: - if not self.enabled: - return list(mm_kwargs) - - assert len(mm_kwargs) == len(mm_hashes) - - out_mm_items = list[Optional[MultiModalKwargsItem]]() - for mm_item, mm_hash in zip(mm_kwargs, mm_hashes): - if self.mm_cache.get(mm_hash) is not None: - out_mm_items.append(None) - else: - self.mm_cache[mm_hash] = \ - MultiModalCacheItemMetadata.wraps(mm_item) - out_mm_items.append(mm_item) - - return out_mm_items - - def reset(self) -> None: - self.mm_cache.clear() - - -class MultiModalInputCacheServer: - """Used by P1 to avoid requiring past multi-modal kwargs from P0.""" - - def __init__(self, model_config: "ModelConfig", - mm_registry: MultiModalRegistry) -> None: - super().__init__() - - self.enabled = mm_registry.enable_mm_input_cache(model_config) - self.mm_cache = MultiModalCache.get_lru_cache( - model_config.get_mm_input_cache_gb(), - MultiModalKwargsItem, - ) - - def get_and_update( - self, - mm_kwargs: Sequence[Optional[MultiModalKwargsItem]], - mm_hashes: list[str], - ) -> list[MultiModalKwargsItem]: - if not self.enabled: - mm_kwargs_lst = list(mm_kwargs) - assert is_list_of(mm_kwargs_lst, MultiModalKwargsItem) - return mm_kwargs_lst 
- - assert len(mm_kwargs) == len(mm_hashes) - - out_mm_items = list[MultiModalKwargsItem]() - for mm_item, mm_hash in zip(mm_kwargs, mm_hashes): - if mm_item is None: - out_mm_items.append(self.mm_cache[mm_hash]) - else: - self.mm_cache[mm_hash] = mm_item - out_mm_items.append(mm_item) - - return out_mm_items - - def reset(self) -> None: - self.mm_cache.clear() diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 300b0713b2ffe..7ed60156626bf 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -11,6 +11,7 @@ from vllm.inputs.parse import split_enc_dec_inputs from vllm.inputs.preprocess import InputPreprocessor from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal.cache import processor_cache_from_config from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.multimodal.utils import argsort_mm_positions @@ -18,7 +19,6 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient from vllm.v1.structured_output.backend_guidance import ( validate_guidance_grammar) from vllm.v1.structured_output.backend_lm_format_enforcer import ( @@ -47,16 +47,17 @@ class Processor: self.generation_config_fields = ( self.model_config.try_get_generation_config()) - self.input_preprocessor = InputPreprocessor(self.model_config, - self.tokenizer, - mm_registry) - self.mm_input_cache_client = MultiModalInputCacheClient( - self.model_config, mm_registry) + self.mm_registry = mm_registry + self.mm_processor_cache = processor_cache_from_config( + vllm_config, mm_registry) - @property - def mm_registry(self): - return self.input_preprocessor.mm_registry + 
self.input_preprocessor = InputPreprocessor( + self.model_config, + self.tokenizer, + mm_registry, + mm_processor_cache=self.mm_processor_cache, + ) def _validate_logprobs( self, @@ -310,7 +311,7 @@ class Processor: # in the input sequence. sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions) - orig_sorted_mm_inputs = [ + sorted_mm_inputs = [ decoder_mm_inputs[modality][idx] for modality, idx in sorted_mm_idxs ] @@ -323,11 +324,6 @@ class Processor: for modality, idx in sorted_mm_idxs ] - sorted_mm_inputs = self.mm_input_cache_client.get_and_update( - orig_sorted_mm_inputs, - sorted_mm_hashes, - ) - return decoder_inputs.get("prompt"), EngineCoreRequest( request_id=request_id, prompt_token_ids=decoder_inputs["prompt_token_ids"], @@ -415,3 +411,6 @@ class Processor: # TODO: Find out how many placeholder tokens are there so we can # check that chunked prefill does not truncate them # max_batch_len = self.scheduler_config.max_num_batched_tokens + + def clear_cache(self) -> None: + self.input_preprocessor.clear_cache() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f1ceaaae62a70..053aaf4f968e0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2186,10 +2186,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): max_items_per_batch: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" + assert self.mm_budget is not None + dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, mm_counts={modality: 1}, + cache=self.mm_budget.cache, ) dummy_mm_data = dummy_decoder_data.multi_modal_data diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 4a485b7e077d4..d364236604274 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1813,10 +1813,13 @@ class 
TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): max_items_per_batch: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" + assert self.mm_budget is not None + dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, mm_counts={modality: 1}, + cache=self.mm_budget.cache, ) dummy_mm_data = dummy_decoder_data.multi_modal_data diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index b96473e7b1645..82ede5ad8eb1e 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -10,6 +10,7 @@ from vllm.attention.backends.abstract import AttentionBackend from vllm.config import ModelConfig, SchedulerConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index +from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.registry import MultiModalRegistry from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget @@ -33,14 +34,18 @@ class MultiModalBudget: self.model_config = model_config self.scheduler_config = scheduler_config self.mm_registry = mm_registry + self.cache = cache = processor_only_cache_from_config( + model_config, mm_registry) self.max_model_len = model_config.max_model_len self.max_num_reqs = scheduler_config.max_num_seqs - self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config) + self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, + cache=cache) max_tokens_by_modality = mm_registry \ - .get_max_tokens_per_item_by_nonzero_modality(model_config) + .get_max_tokens_per_item_by_nonzero_modality(model_config, + cache=cache) encoder_compute_budget, encoder_cache_size = compute_mm_encoder_budget( scheduler_config, From 64466778397482e0cb9ff9f6b320ca6d9dc567ae Mon Sep 17 00:00:00 2001 From: Kunshang Ji 
<kunshang.ji@intel.com> Date: Wed, 27 Aug 2025 15:27:14 +0800 Subject: [PATCH 074/112] [XPU]fix cuda event used in XPU model runner (#23708) Signed-off-by: Kunshang Ji <kunshang.ji@intel.com> --- vllm/v1/worker/xpu_model_runner.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/xpu_model_runner.py b/vllm/v1/worker/xpu_model_runner.py index 59f8d0fcf5bd9..fb892211f19db 100644 --- a/vllm/v1/worker/xpu_model_runner.py +++ b/vllm/v1/worker/xpu_model_runner.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from contextlib import contextmanager from typing import TYPE_CHECKING import torch @@ -22,7 +23,8 @@ class XPUModelRunner(GPUModelRunner): vllm_config: VllmConfig, device: torch.device, ): - super().__init__(vllm_config, device) + with _torch_cuda_wrapper(): + super().__init__(vllm_config, device) # FIXME: To be verified. self.cascade_attn_enabled = False @@ -31,3 +33,21 @@ class XPUModelRunner(GPUModelRunner): def _sync_device(self) -> None: torch.xpu.synchronize() + + +@contextmanager +def _torch_cuda_wrapper(): + + class _EventPlaceholder: + + def __init__(self, *args, **kwargs) -> None: + self.record = lambda: None + self.synchronize = lambda: None + + try: + # replace cuda Event with xpu Event, this should work by default + torch.cuda.Event = torch.xpu.Event + yield + finally: + # if anything goes wrong, just patch it with a placeholder + torch.cuda.Event = _EventPlaceholder From 91e382c935c2905c29f3ca22c658e03e8f02deaa Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Wed, 27 Aug 2025 16:11:15 +0800 Subject: [PATCH 075/112] [CI/Build] Remove redundant register in model init tests (#23715) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- tests/models/test_initialization.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/models/test_initialization.py 
b/tests/models/test_initialization.py index bbd3da982af84..b4d516233b4bf 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -38,11 +38,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, model_arch=model_arch, exist_overrides=model_info.hf_overrides) - if model_arch in ("Llama4ForCausalLM", "EagleLlama4ForCausalLM"): - from vllm.model_executor.models.llama4 import Llama4ForCausalLM - from vllm.model_executor.models.registry import ModelRegistry - ModelRegistry.register_model("Llama4ForCausalLM", Llama4ForCausalLM) - # Avoid calling model.forward() def _initialize_kv_caches_v0(self) -> None: self.cache_config.num_gpu_blocks = 0 From 5bd9f841581a3a9e9eecdd8764240575bb28e391 Mon Sep 17 00:00:00 2001 From: Michael Yao <haifeng.yao@daocloud.io> Date: Wed, 27 Aug 2025 17:50:09 +0800 Subject: [PATCH 076/112] [Docs] Fix an admonition important (#23726) Signed-off-by: windsonsea <haifeng.yao@daocloud.io> --- docs/configuration/optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 3eaf2185a559e..a8eab9985c8b9 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -164,7 +164,7 @@ llm = LLM( ) ``` -!! important +!!! important Batch-level DP is not to be confused with API request-level DP (which is instead controlled by `data_parallel_size`). 
From 6578e873655859462758c5c51e51f876f2aa24a3 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Wed, 27 Aug 2025 02:52:45 -0700 Subject: [PATCH 077/112] Optimize input preparation for FlashInfer [2/N] (#23174) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- vllm/v1/attention/backends/flashinfer.py | 82 ++++++++++++++++-------- 1 file changed, 55 insertions(+), 27 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 941d2a4d7f1ac..f948157c2b575 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -6,6 +6,7 @@ from __future__ import annotations from dataclasses import dataclass from typing import ClassVar, Optional, Union +import numpy as np import torch from flashinfer import (BatchDecodeWithPagedKVCacheWrapper, BatchPrefillWithPagedKVCacheWrapper, @@ -22,6 +23,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, kFp8StaticTensorSym, kNvfp4Quant) from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton from vllm.utils import cdiv, is_pin_memory_available from vllm.utils.flashinfer import (supports_trtllm_attention, use_trtllm_attention) @@ -230,6 +232,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): dtype=torch.int32, device="cpu", pin_memory=pin_memory) + self.paged_kv_indptr_np = self.paged_kv_indptr_cpu.numpy() self.paged_kv_indices_cpu = torch.zeros(max_num_pages, dtype=torch.int32, device="cpu", @@ -238,10 +241,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): dtype=torch.int32, device="cpu", pin_memory=pin_memory) - - self.block_table_arange = torch.arange(max_num_pages_per_req, - dtype=torch.int32, - device=self.device) + self.paged_kv_last_page_len_np = ( + self.paged_kv_last_page_len_cpu.numpy()) def _get_workspace_buffer(self): if 
self._workspace_buffer is None: @@ -317,9 +318,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): max_seq_len = common_attn_metadata.max_seq_len seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu + seq_lens_np = seq_lens_cpu.numpy() block_table_tensor = common_attn_metadata.block_table_tensor - block_table_bounds_cpu = (seq_lens_cpu + page_size - 1) // page_size + num_blocks_np = (seq_lens_np + (page_size - 1)) // page_size use_cascade = common_prefix_len > 0 if use_cascade: @@ -342,37 +344,41 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): # Remove the blocks of the shared prefix from all requests. block_table_tensor = block_table_tensor[:, num_common_kv_blocks:] - block_table_bounds_cpu -= num_common_kv_blocks + num_blocks_np -= num_common_kv_blocks else: shared_qo_indptr_cpu = None shared_kv_page_indptr_cpu = None shared_kv_page_indices_cpu = None shared_kv_last_page_len_cpu = None - max_num_blocks = block_table_bounds_cpu.max().item() - block_table_bounds = block_table_bounds_cpu.to(self.device, - non_blocking=True) - mask = (self.block_table_arange[:max_num_blocks].unsqueeze(0) - < block_table_bounds.unsqueeze(1)) - # write self.paged_kv_indices inplace - num_actual_pages = torch.sum(mask) - paged_kv_indices = self.paged_kv_indices[:num_actual_pages] - torch.masked_select(block_table_tensor[:, :max_num_blocks], - mask, - out=paged_kv_indices) - # write self.paged_kv_indptr_cpu inplace (0-index is always 0) - torch.cumsum(block_table_bounds_cpu, - dim=0, - dtype=torch.int32, - out=self.paged_kv_indptr_cpu[1:1 + num_reqs]) + np.cumsum( + num_blocks_np, + dtype=np.int32, + out=self.paged_kv_indptr_np[1:num_reqs + 1], + ) + paged_kv_indptr = self.paged_kv_indptr[:num_reqs + 1] + paged_kv_indptr.copy_(self.paged_kv_indptr_cpu[:num_reqs + 1], + non_blocking=True) + + # write self.paged_kv_indices inplace + num_actual_pages = num_blocks_np.sum().item() + 
paged_kv_indices = self.paged_kv_indices[:num_actual_pages] + _copy_page_indices_kernel[(num_reqs, )]( + paged_kv_indices, + block_table_tensor, + block_table_tensor.stride(0), + paged_kv_indptr, + BLOCK_SIZE=1024, + ) - paged_kv_last_page_len_cpu = seq_lens_cpu % page_size # write self.paged_kv_last_page_len_cpu inplace - torch.where(paged_kv_last_page_len_cpu == 0, - torch.tensor(page_size), - paged_kv_last_page_len_cpu, - out=self.paged_kv_last_page_len_cpu[:num_reqs]) + paged_kv_last_page_len_np = seq_lens_np % page_size + self.paged_kv_last_page_len_np[:num_reqs] = np.where( + paged_kv_last_page_len_np == 0, + page_size, + paged_kv_last_page_len_np, + ) # Check if any layer uses sinks (requires TRTLLM attention) has_sinks = self.global_hyperparameters.has_sinks @@ -1002,3 +1008,25 @@ def fast_plan_decode( self._sm_scale = sm_scale self._rope_scale = rope_scale self._rope_theta = rope_theta + + +@triton.jit +def _copy_page_indices_kernel( + page_indices, + block_table, + block_table_stride, + cu_num_blocks, + BLOCK_SIZE: tl.constexpr, +): + req_idx = tl.program_id(0) + row_ptr = block_table + req_idx * block_table_stride + start_idx = tl.load(cu_num_blocks + req_idx) + end_idx = tl.load(cu_num_blocks + req_idx + 1) + num_blocks = end_idx - start_idx + + offset = tl.arange(0, BLOCK_SIZE) + for i in tl.range(0, num_blocks, BLOCK_SIZE): + block_ids = tl.load(row_ptr + i + offset, mask=i + offset < num_blocks) + tl.store(page_indices + start_idx + i + offset, + block_ids, + mask=i + offset < num_blocks) From 04ff1e43fb6e2e675170d0c90399290f8925abb7 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Wed, 27 Aug 2025 03:25:00 -0700 Subject: [PATCH 078/112] [Misc] Move CpuGpuBuffer to vllm/v1/utils.py (#23728) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- vllm/v1/utils.py | 29 +++++++++++++++++++++++++++++ vllm/v1/worker/cpu_model_runner.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 6 +++--- vllm/v1/worker/utils.py | 29 
----------------------------- 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index b5750c82db023..8f9face6fbf2e 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -96,6 +96,35 @@ class ConstantList(Generic[T], Sequence): return f"ConstantList({self._x})" +class CpuGpuBuffer: + + def __init__( + self, + *args, + dtype: torch.dtype, + device: torch.device, + pin_memory: bool, + ): + self.cpu = torch.zeros(*args, + dtype=dtype, + device="cpu", + pin_memory=pin_memory) + self.np = self.cpu.numpy() + self.gpu = self.cpu.to(device) + + def copy_to_gpu(self, n: Optional[int] = None) -> torch.Tensor: + if n is None: + return self.gpu.copy_(self.cpu, non_blocking=True) + return self.gpu[:n].copy_(self.cpu[:n], non_blocking=True) + + def copy_to_cpu(self, n: Optional[int] = None) -> torch.Tensor: + """NOTE: Because this method is non-blocking, explicit synchronization + is needed to ensure the data is copied to CPU.""" + if n is None: + return self.cpu.copy_(self.gpu, non_blocking=True) + return self.cpu[:n].copy_(self.gpu[:n], non_blocking=True) + + def get_engine_client_zmq_addr(local_only: bool, host: str, port: int = 0) -> str: diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 137578f0e6088..742e553b77e09 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -10,8 +10,8 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.v1.attention.backends.cpu_attn import TorchSDPAMetadataBuilderV1 +from vllm.v1.utils import CpuGpuBuffer from vllm.v1.worker.gpu_model_runner import GPUModelRunner -from vllm.v1.worker.utils import CpuGpuBuffer if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 053aaf4f968e0..d93460d618e7c 100644 --- 
a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -78,14 +78,14 @@ from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.spec_decode.medusa import MedusaProposer from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer +from vllm.v1.utils import CpuGpuBuffer from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorModelRunnerMixin, KVConnectorOutput) from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from .utils import (AttentionGroup, CpuGpuBuffer, MultiModalBudget, - bind_kv_cache, gather_mm_placeholders, - initialize_kv_cache_for_kv_sharing, +from .utils import (AttentionGroup, MultiModalBudget, bind_kv_cache, + gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) if TYPE_CHECKING: diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 82ede5ad8eb1e..f407534687662 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -303,32 +303,3 @@ def bind_kv_cache( for layer_name, kv_cache in kv_caches.items(): # NOTE: Use list because of v0 PP virtual engine. 
forward_context[layer_name].kv_cache = [kv_cache] - - -class CpuGpuBuffer: - - def __init__( - self, - *args, - dtype: torch.dtype, - device: torch.device, - pin_memory: bool, - ): - self.cpu = torch.zeros(*args, - dtype=dtype, - device="cpu", - pin_memory=pin_memory) - self.np = self.cpu.numpy() - self.gpu = self.cpu.to(device) - - def copy_to_gpu(self, n: Optional[int] = None) -> None: - if n is None: - return self.gpu.copy_(self.cpu, non_blocking=True) - return self.gpu[:n].copy_(self.cpu[:n], non_blocking=True) - - def copy_to_cpu(self, n: Optional[int] = None) -> None: - """NOTE: Because this method is non-blocking, explicit synchronization - is needed to ensure the data is copied to CPU.""" - if n is None: - return self.cpu.copy_(self.gpu, non_blocking=True) - return self.cpu[:n].copy_(self.gpu[:n], non_blocking=True) From 11eddf02f0234f79435d747f2d3dce117ab39aa1 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Wed, 27 Aug 2025 03:45:04 -0700 Subject: [PATCH 079/112] [FlashInfer] Cache hyper params in metadata builder (#23732) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- vllm/v1/attention/backends/flashinfer.py | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index f948157c2b575..1115fc606b055 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -214,6 +214,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): # TODO: discard this for trtllm-gen backend self.global_hyperparameters = infer_global_hyperparameters( get_per_layer_parameters(vllm_config, layer_names, FlashInferImpl)) + self.sm_scale = self.global_hyperparameters.sm_scale + self.window_left = self.global_hyperparameters.window_left + self.logits_soft_cap = self.global_hyperparameters.logits_soft_cap + self.has_sinks = 
self.global_hyperparameters.has_sinks # Preparing persistent buffers (device-side) self.paged_kv_indptr = torch.zeros(max_num_reqs + 1, @@ -381,8 +385,6 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): ) # Check if any layer uses sinks (requires TRTLLM attention) - has_sinks = self.global_hyperparameters.has_sinks - prefill_use_trtllm = use_trtllm_attention(self.num_qo_heads, self.num_kv_heads, num_prefill_tokens, @@ -390,7 +392,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.cache_dtype, self.q_data_type, is_prefill=True, - has_sinks=has_sinks) + has_sinks=self.has_sinks) decode_use_trtllm = use_trtllm_attention(self.num_qo_heads, self.num_kv_heads, num_decode_tokens, @@ -398,7 +400,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.cache_dtype, self.q_data_type, is_prefill=False, - has_sinks=has_sinks) + has_sinks=self.has_sinks) attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, @@ -433,9 +435,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.head_dim, self.page_size, causal=True, - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters.logits_soft_cap, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, q_data_type=self.q_data_type, kv_data_type=self.kv_cache_dtype, ) @@ -472,10 +474,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.head_dim, self.page_size, causal=True, - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters. 
- logits_soft_cap, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, q_data_type=self.q_data_type, kv_data_type=self.kv_cache_dtype, ) @@ -525,10 +526,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.page_size, # Disable flashinfer's pos encoding and use vllm's rope. pos_encoding_mode="NONE", - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters. - logits_soft_cap, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, q_data_type=self.q_data_type, kv_data_type=self.kv_cache_dtype, ) From e03940762b43812fccd3c214bda60201cff9d16a Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Wed, 27 Aug 2025 18:59:35 +0800 Subject: [PATCH 080/112] [CI/Build] Reduce LoRA layer test cases (#23721) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- tests/lora/test_layers.py | 72 ++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 39 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 92db023babc28..6e2dda464d8eb 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -243,7 +243,7 @@ def check_punica_wrapper(punica_wrapper) -> bool: @torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) @pytest.mark.parametrize("stage", STAGES) @@ -347,7 +347,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: @torch.inference_mode() # @pytest.mark.skip( # reason="Fails when loras are in any slot other than the first.") -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("device", DEVICES) 
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) @pytest.mark.parametrize("stage", STAGES) @@ -486,7 +486,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device, @torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512]) @pytest.mark.parametrize("stage", STAGES) @@ -620,12 +620,15 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, @torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("stage", STAGES) -@pytest.mark.parametrize("bias_enabled", [True, False]) -def test_linear_replicated(dist_init, num_loras, device, stage, - bias_enabled) -> None: +def test_linear_replicated( + dist_init, + num_loras, + device, + stage, +) -> None: if current_platform.is_cuda_alike(): torch.cuda.set_device(device) @@ -634,10 +637,11 @@ def test_linear_replicated(dist_init, num_loras, device, stage, torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - lora_dtype=torch.float16, - bias_enabled=bias_enabled) + lora_config = LoRAConfig( + max_loras=max_loras, + max_lora_rank=8, + lora_dtype=torch.float16, + ) def create_random_linear_replicated_layer(): @@ -651,10 +655,6 @@ def test_linear_replicated(dist_init, num_loras, device, stage, lora_linear.create_lora_weights(max_loras, lora_config) assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( lora_linear.lora_b_stacked) == 1) - if bias_enabled: - assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices - else: - assert lora_linear.lora_bias_stacked is None return 
linear, lora_linear for i in range(NUM_RANDOM_SEEDS): @@ -734,14 +734,13 @@ def test_linear_replicated(dist_init, num_loras, device, stage, @torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("orientation", ["row", "column"]) @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("stage", STAGES) -@pytest.mark.parametrize("bias_enabled", [True, False]) def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, - device, stage, bias_enabled) -> None: + device, stage) -> None: if current_platform.is_cuda_alike(): torch.cuda.set_device(device) @@ -750,11 +749,12 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - fully_sharded_loras=fully_shard, - lora_dtype=torch.float16, - bias_enabled=bias_enabled) + lora_config = LoRAConfig( + max_loras=max_loras, + max_lora_rank=8, + fully_sharded_loras=fully_shard, + lora_dtype=torch.float16, + ) def create_random_linear_parallel_layer(): if orientation == "row": @@ -777,10 +777,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, lora_linear.create_lora_weights(max_loras, lora_config) assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( lora_linear.lora_b_stacked) == 1) - if bias_enabled: - assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices - else: - assert lora_linear.lora_bias_stacked is None + return linear, lora_linear for i in range(NUM_RANDOM_SEEDS): @@ -860,14 +857,13 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, @torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) 
+@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("repeats", [1, 2, 3]) @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("stage", STAGES) -@pytest.mark.parametrize("bias_enabled", [True, False]) def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, - device, stage, bias_enabled) -> None: + device, stage) -> None: if current_platform.is_cuda_alike(): torch.cuda.set_device(device) @@ -876,11 +872,12 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - fully_sharded_loras=fully_shard, - lora_dtype=torch.float16, - bias_enabled=bias_enabled) + lora_config = LoRAConfig( + max_loras=max_loras, + max_lora_rank=8, + fully_sharded_loras=fully_shard, + lora_dtype=torch.float16, + ) def create_column_parallel_packed_layer(): if repeats == 2: @@ -924,10 +921,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, model_config=FakeConfig()) assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( lora_linear.lora_b_stacked) == n_slices) - if bias_enabled: - assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices - else: - assert lora_linear.lora_bias_stacked is None + return linear, lora_linear for i in range(NUM_RANDOM_SEEDS): From 8f0d7eaea87409a54ccaed76995b59c6b0a3d4cf Mon Sep 17 00:00:00 2001 From: Fanli Lin <fanli0116@gmail.com> Date: Wed, 27 Aug 2025 19:57:38 +0800 Subject: [PATCH 081/112] [XPU] Fix OOM issue for data parallel with Ray backend (#22500) Signed-off-by: Fanli Lin <fanli.lin@intel.com> Signed-off-by: Fanli Lin <fanli0116@gmail.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> --- vllm/v1/engine/core.py | 27 ++++++++++++++++++--------- 
vllm/v1/engine/utils.py | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index b614828061846..a7038e2d2c264 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -39,7 +39,8 @@ from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType, ReconfigureDistributedRequest, ReconfigureRankType, UtilityOutput, UtilityResult) -from vllm.v1.engine.utils import EngineHandshakeMetadata, EngineZmqAddresses +from vllm.v1.engine.utils import (EngineHandshakeMetadata, EngineZmqAddresses, + get_device_indices) from vllm.v1.executor.abstract import Executor from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.metrics.stats import SchedulerStats @@ -1169,22 +1170,30 @@ class DPEngineCoreActor(DPEngineCoreProc): # https://github.com/ray-project/ray/pull/40461/files#diff-31e8159767361e4bc259b6d9883d9c0d5e5db780fcea4a52ead4ee3ee4a59a78R1860 # noqa: E501 # and get_accelerator_ids_for_accelerator_resource() in worker.py # of ray. - self._set_cuda_visible_devices(vllm_config, local_dp_rank) + self._set_visible_devices(vllm_config, local_dp_rank) super().__init__(vllm_config, local_client, "", executor_class, log_stats) - def _set_cuda_visible_devices(self, vllm_config: VllmConfig, - local_dp_rank: int): + def _set_visible_devices(self, vllm_config: VllmConfig, + local_dp_rank: int): from vllm.platforms import current_platform - device_control_env_var = current_platform.device_control_env_var + if current_platform.is_xpu(): + pass + else: + device_control_env_var = current_platform.device_control_env_var + self._set_cuda_visible_devices(vllm_config, local_dp_rank, + device_control_env_var) + + def _set_cuda_visible_devices(self, vllm_config: VllmConfig, + local_dp_rank: int, + device_control_env_var: str): world_size = vllm_config.parallel_config.world_size # Set CUDA_VISIBLE_DEVICES or equivalent. 
try: - os.environ[device_control_env_var] = ",".join( - str(current_platform.device_id_to_physical_device_id(i)) - for i in range(local_dp_rank * - world_size, (local_dp_rank + 1) * world_size)) + value = get_device_indices(device_control_env_var, local_dp_rank, + world_size) + os.environ[device_control_env_var] = value except IndexError as e: raise Exception( f"Error setting {device_control_env_var}: " diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index 62f229e286931..56ef8477d267a 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -164,19 +164,33 @@ def set_device_control_env_var(vllm_config: VllmConfig, """ world_size = vllm_config.parallel_config.world_size evar = current_platform.device_control_env_var + + value = get_device_indices(evar, local_dp_rank, world_size) + with patch.dict(os.environ, values=((evar, value), )): + yield + + +def get_device_indices(device_control_env_var: str, local_dp_rank: int, + world_size: int): + """ + Returns a comma-separated string of device indices for the specified + data parallel rank. + + For example, if world_size=2 and local_dp_rank=1, and there are 4 devices, + this will select devices 2 and 3 for local_dp_rank=1. 
+ """ try: value = ",".join( str(current_platform.device_id_to_physical_device_id(i)) for i in range(local_dp_rank * world_size, (local_dp_rank + 1) * world_size)) except IndexError as e: - raise Exception(f"Error setting {evar}: " + raise Exception(f"Error setting {device_control_env_var}: " f"local range: [{local_dp_rank * world_size}, " f"{(local_dp_rank + 1) * world_size}) " "base value: " - f"\"{os.getenv(evar)}\"") from e - with patch.dict(os.environ, values=((evar, value), )): - yield + f"\"{os.getenv(device_control_env_var)}\"") from e + return value class CoreEngineActorManager: @@ -254,6 +268,19 @@ class CoreEngineActorManager: dp_vllm_config = copy.deepcopy(vllm_config) dp_vllm_config.parallel_config.placement_group = pg local_client = index < local_engine_count + + # Ray XPU known issue: dpctl initializes the GPU runtime early, so + # setting device env vars in Ray actor's initialization method + # will not affect device selection. See: + # https://github.com/ray-project/ray/blob/master/python/ray/_private/accelerators/intel_gpu.py#L56 # noqa: E501 + if current_platform.is_xpu(): + device_evar = current_platform.device_control_env_var + device_indices = get_device_indices(device_evar, local_index, + world_size) + actor_env_vars = self.env_vars_dict.copy() + actor_env_vars[device_evar] = device_indices + runtime_env = RuntimeEnv(env_vars=actor_env_vars) + actor = ray.remote(DPEngineCoreActor).options( scheduling_strategy=PlacementGroupSchedulingStrategy( placement_group=pg, From 1f7a9c95e4b2a1e02b19e94fd7371443f08b2e4b Mon Sep 17 00:00:00 2001 From: Michael Yao <haifeng.yao@daocloud.io> Date: Wed, 27 Aug 2025 20:37:52 +0800 Subject: [PATCH 082/112] [Docs] Fix a 1-2-3 list and style issues in tpu.md (#23729) Signed-off-by: windsonsea <haifeng.yao@daocloud.io> --- docs/configuration/tpu.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md index 
ac2b6baffd14e..e456077e04958 100644 --- a/docs/configuration/tpu.md +++ b/docs/configuration/tpu.md @@ -45,30 +45,30 @@ This initial compilation time ranges significantly and is impacted by many of th ### Optimize based on your data -#### max model len vs. most model len +#### max-model-len vs. most-model-len ![most_model_len](../assets/design/tpu/most_model_len.png) -If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most model len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable. +If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most-model-len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable. For example, 1% requests are 32k length and 99% requests are 2k length. You can pass 32k into `--max-model-len 32768` and use `VLLM_TPU_MOST_MODEL_LEN=2048`. -The requests get subdivided into max-model-len and most-model-len categories, for the latter category, we can gain better performance since the server can process more requests at a time. +The requests get subdivided into max-model-len and most-model-len categories, for the latter category, you can gain better performance since the server can process more requests at a time. #### Padding -For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. Because of the layout of the TPU, try using increments of 128: 128, 256, etc. +For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. 
Because of the layout of the TPU, try using increments of 128 (e.g., 128, 256, etc.) -The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about tpu padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are 2 ways to pad the requests: +The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about TPU padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are 2 ways to pad the requests: -1) the default exponential padding (pad to the nearest power of 2) -2) bucket padding (pad to the nearest linearly increasing bucket). +1. the default exponential padding (pad to the nearest power of 2) +2. bucket padding (pad to the nearest linearly increasing bucket). When using bucket padding, the buckets start from 16, end at max_model_len, and increment by `VLLM_TPU_BUCKET_PADDING_GAP`. For example, max_model_len=512, padding_gap=64, the buckets will be [16, 32, 64, 128, 192, 256, 320, 384, 448, 512]. -The fewer tokens we pad, the less unnecessary computation TPU does, the better performance we can get. For example, if num_tokens=300, with exponential padding, we pad to 512, with the bucket_padding above, we pad to 320. +The fewer tokens you pad, the less unnecessary computation TPU does, the better performance you can get. For example, if num_tokens=300, with exponential padding, you pad to 512, with the bucket_padding above, you pad to 320. However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compiled graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding. 
From 9d30de44698e1e337e4736ff62b83ebe1bbd4d40 Mon Sep 17 00:00:00 2001 From: tc-mb <157115220+tc-mb@users.noreply.github.com> Date: Wed, 27 Aug 2025 20:38:00 +0800 Subject: [PATCH 083/112] [model] Support MiniCPM-V 4.5 (#23586) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: tc-mb <caitianchi@modelbest.cn> Signed-off-by: Xin Yang <xyangx@amazon.com> Signed-off-by: Abatom <abzhonghua@gmail.com> Signed-off-by: chzhang <chaojun.zhang@intel.com> Signed-off-by: Pate Motter <patemotter@google.com> Signed-off-by: Terrencezzj <terrence@cohere.ai> Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> Signed-off-by: simon-mo <simon.mo@hey.com> Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: Siyuan Fu <siyuanf@nvidia.com> Signed-off-by: siyuanf <siyuanf@nvidia.com> Signed-off-by: Weiliang Liu <weiliangl@nvidia.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Signed-off-by: yewentao256 <zhyanwentao@126.com> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Signed-off-by: Zijing Liu <liuzijing2014@gmail.com> Signed-off-by: Zijing Liu <liuzijing2014@users.noreply.github.com> Signed-off-by: jiabin.00 <jiabin.00@bytedance.com> Signed-off-by: zjy0516 <riverclouds.zhu@qq.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> Signed-off-by: tc-mb <157115220+tc-mb@users.noreply.github.com> Signed-off-by: Roger Wang <hey@rogerw.me> Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: Huy Do <huydhn@gmail.com> Signed-off-by: Matúš Námešný <matus.namesny@ameria.com> Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com> Signed-off-by: Chen Zhang <zhangch99@outlook.com> Signed-off-by: oye93 <en.ouyang93@outlook.com> Signed-off-by: Julien Lin <jullin@nvidia.com> Signed-off-by: Didier Durand <durand.didier@gmail.com> Signed-off-by: Tianyu Li <tianyu.li@arm.com> 
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com> Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com> Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com> Signed-off-by: jiang1.li <jiang1.li@intel.com> Signed-off-by: Zerohertz <ohg3417@gmail.com> Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com> Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Signed-off-by: Russell Bryant <rbryant@redhat.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com> Signed-off-by: Federico <65908512+coval3nte@users.noreply.github.com> Signed-off-by: Zixuan Zhang <zixuanzhang@bytedance.com> Signed-off-by: wuhang <wuhang6@huawei.com> Signed-off-by: czhu-cohere <conway.zhu@cohere.com> Signed-off-by: Wei Wei <wwei6@meta.com> Signed-off-by: Yiheng Xu <charlesyihengxu@gmail.com> Signed-off-by: Chenheli Hua <huachenheli@outlook.com> Signed-off-by: wangyafeng <wangyafeng@baidu.com> Co-authored-by: Xin Yang <105740670+xyang16@users.noreply.github.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Zhonghua Deng <abzhonghua@gmail.com> Co-authored-by: Chaojun Zhang <chaojun.zhang@intel.com> Co-authored-by: Pate Motter <p@temotter.com> Co-authored-by: Terrence Zhao <32208165+Terrencezzj@users.noreply.github.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: Simon Mo <simon.mo@hey.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: weiliang <weiliangl@nvidia.com> Co-authored-by: Siyuan Fu <siyuanf@nvidia.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Zijing Liu <liuzijing2014@users.noreply.github.com> 
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com> Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Raghavan <oneraghavan@gmail.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Roger Wang <hey@rogerw.io> Co-authored-by: Roger Wang <hey@rogerw.me> Co-authored-by: knlnguyen1802 <knlnguyen1802@gmail.com> Co-authored-by: Huy Do <huydhn@gmail.com> Co-authored-by: Matúš Námešný <matus@namesny.com> Co-authored-by: Guillaume Calmettes <gcalmettes@scaleway.com> Co-authored-by: Chen Zhang <zhangch99@outlook.com> Co-authored-by: En Ouyang <en.ouyang93@outlook.com> Co-authored-by: Li, Jiang <jiang1.li@intel.com> Co-authored-by: nvjullin <jullin@nvidia.com> Co-authored-by: Didier Durand <2927957+didier-durand@users.noreply.github.com> Co-authored-by: TianyuLi0 <116711075+TianyuLi0@users.noreply.github.com> Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Yuekai Zhang <zhangyuekai@foxmail.com> Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com> Co-authored-by: Hyogeun Oh (오효근) <ohg3417@gmail.com> Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> Co-authored-by: Lukas Geiger <lukas.geiger94@gmail.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Huzaifa Sidhpurwala <huzaifas@redhat.com> Co-authored-by: Federico <65908512+coval3nte@users.noreply.github.com> Co-authored-by: zixuanzhang226 <zixuanzhang@bytedance.com> Co-authored-by: wuhang <wuhang6@huawei.com> Co-authored-by: yzds <41983536+youzhedian@users.noreply.github.com> Co-authored-by: hongchao <hongchao@msh.team> Co-authored-by: czhu-cohere <conway.zhu@cohere.com> Co-authored-by: Wei <weiweinpu@gmail.com> Co-authored-by: Yiheng Xu <charlesyihengxu@gmail.com> Co-authored-by: Aaron Pham <contact@aarnphm.xyz> 
Co-authored-by: Chenheli Hua <huachenheli@outlook.com> Co-authored-by: CSWYF3634076 <58356743+CSWYF3634076@users.noreply.github.com> --- docs/models/supported_models.md | 2 +- tests/models/registry.py | 2 +- vllm/model_executor/models/minicpmv.py | 314 +++++++++++++++++- .../chat_templates/registry.py | 11 + .../chat_templates/template_minicpmv45.jinja | 93 ++++++ 5 files changed, 407 insertions(+), 15 deletions(-) create mode 100644 vllm/transformers_utils/chat_templates/template_minicpmv45.jinja diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 19ce8c06724f4..35a5fa0c2e42f 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -638,7 +638,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ | | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ | | `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, etc. | ✅︎ | | ✅︎ | +| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ | | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ | | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | diff --git a/tests/models/registry.py b/tests/models/registry.py index f2c09d3e8452a..ee546e7af85c6 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -451,7 +451,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", trust_remote_code=True), "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", - extras={"2.6": "openbmb/MiniCPM-V-2_6", "4.0": "openbmb/MiniCPM-V-4"}, # noqa: E501 + extras={"2.6": "openbmb/MiniCPM-V-2_6", "4.0": "openbmb/MiniCPM-V-4", "4.5": "openbmb/MiniCPM-V-4_5"}, # noqa: E501 trust_remote_code=True), "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501 trust_remote_code=True, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index c22d871ab20d9..2d785c30fd7df 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -27,12 +27,14 @@ import math from collections import defaultdict from collections.abc import Iterable, Mapping, Sequence from functools import partial +from itertools import chain from typing import Annotated, Any, Callable, Literal, Optional, Union import numpy as np import torch import torch.types from torch import nn +from torch.nn.init import trunc_normal_ from transformers import BatchFeature, PretrainedConfig from typing_extensions import TypeVar @@ -47,10 +49,11 @@ from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.models.minicpm import MiniCPMForCausalLM from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM +from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal 
import MULTIMODAL_REGISTRY, MultiModalKwargsItems +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - NestedTensors) + MultiModalKwargsItems, NestedTensors) from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem, ImageProcessorItems, ImageSize, ModalityData, ModalityDataItems, @@ -218,6 +221,187 @@ class Resampler2_5(BaseResampler): return x +class Resampler4_5(Resampler2_5): + + def __init__(self, + num_queries: int, + embed_dim: int, + num_heads: int, + kv_dim: Optional[int] = None, + norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, + max_size: tuple[int, int] = (70, 70), + max_temporal_size: int = 36000, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: + super().__init__(num_queries, + embed_dim, + num_heads, + kv_dim, + norm_layer, + max_size, + quant_config=quant_config, + prefix=prefix) + + trunc_normal_(self.query, std=.02) + self.max_temporal_size = max_temporal_size + self._set_temporal_pos_cache(self.max_temporal_size) + self.apply(self._init_weights) + + def get_1d_sincos_pos_embed_from_temporal_size(self, embed_dim: int, + pos: np.ndarray): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. 
/ 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + def _set_temporal_pos_cache(self, + max_temporal_size: int, + device: torch.types.Device = "cpu") -> None: + temporal_size = np.arange(max_temporal_size, dtype=np.float32) + pos_embed = torch.from_numpy( + self.get_1d_sincos_pos_embed_from_temporal_size( + self.embed_dim, temporal_size)).float().to(device) + self.register_buffer("temporal_pos_embed", pos_embed, persistent=False) + + def _adjust_temporal_pos_cache(self, + max_temporal_size: int, + device: torch.types.Device = "cpu"): + if max_temporal_size > self.max_temporal_size: + self.max_temporal_size = max_temporal_size + self._set_temporal_pos_cache(self.max_temporal_size, device) + + def _init_weights(self, m: Union[nn.Linear, nn.LayerNorm]): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward( + self, + x: torch.Tensor, + tgt_sizes: torch.Tensor, + # temporal_ids for high refresh rate videos + temporal_ids=None + ) -> torch.Tensor: + assert x.shape[0] == tgt_sizes.shape[0] + bs = x.shape[0] + + device = x.device + dtype = x.dtype + + patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] + + self._adjust_pos_cache(tgt_sizes, device=device) + + temporal_pos_emb = False + temporal_ids_flatten = None + if temporal_ids is not None: + # example: [[-1], [-1], [2, 6, 9]] + temporal_ids_flatten = list(chain.from_iterable(temporal_ids)) + max_temporal_size = max(temporal_ids_flatten, default=0) + if max_temporal_size > -1: + temporal_pos_emb = True + if max_temporal_size > self.max_temporal_size: + 
self._adjust_temporal_pos_cache(max_temporal_size, device) + + max_patch_len = patch_len.max().item() + assert isinstance(max_patch_len, int) + + key_padding_mask = torch.zeros((bs, max_patch_len), + dtype=torch.bool, + device=device) + + x, _ = self.kv_proj(x) # B * L * D + x = self.ln_kv(x).permute(1, 0, 2) # L * B * D + q = self.ln_q(self.query) # Q * D + + pos_embed_2d = [] + pos_embed_temporal = [] + for i in range(bs): + tgt_h, tgt_w = tgt_sizes[i] + if temporal_pos_emb: + if temporal_ids_flatten[i] == -1: + pos_embed_temporal.append( + torch.zeros(self.embed_dim, dtype=dtype, + device=device)) + else: + pos_embed_temporal.append(self.temporal_pos_embed[ + temporal_ids_flatten[i]].to(dtype)) # D + + pos_embed_2d.append(self.pos_embed[:tgt_h, :tgt_w, :].reshape( + (tgt_h * tgt_w, -1)).to(dtype)) # patches * D + key_padding_mask[i, patch_len[i]:] = True + + pos_embed_2d = torch.nn.utils.rnn.pad_sequence( + pos_embed_2d, batch_first=True, + padding_value=0.0).permute(1, 0, 2) # BLD => L * B * D + + k = x + v = x + pos_embed_2d + if pos_embed_temporal: + k += torch.stack(pos_embed_temporal, dim=0) + bs = len(temporal_ids) + merge_k = [] + merge_v = [] + merge_key_padding_mask = [] + + start = 0 + for tp in temporal_ids: + end = start + len(tp) + # L * (end-start) * D -> (end-start) * L * D + # -> 1 * L*(end-start) * D + merge_k.append(k[:, start:end, :].permute(1, 0, 2).reshape( + -1, self.embed_dim)) + merge_v.append(v[:, start:end, :].permute(1, 0, 2).reshape( + -1, self.embed_dim)) + merge_key_padding_mask.append( + key_padding_mask[start:end, :].reshape(-1, 1)) + + start = end + + k = torch.nn.utils.rnn.pad_sequence(merge_k, + batch_first=True, + padding_value=0.0).permute( + 1, 0, 2) # L*(end-start) + v = torch.nn.utils.rnn.pad_sequence(merge_v, + batch_first=True, + padding_value=0.0).permute( + 1, 0, 2) # L*(end-start) + key_padding_mask = torch.nn.utils.rnn.pad_sequence( + merge_key_padding_mask, batch_first=True, + padding_value=True).squeeze(-1) + + out 
= self.attn( + self._repeat(q, bs), # Q * B * D + k, # L * B * D + L * B * D + v, + key_padding_mask=key_padding_mask, + )[0] + # out: Q * B * D + x = out.permute(1, 0, 2) # B * Q * D + + x = self.ln_post(x) + x = x @ self.proj + return x + + def get_version_by_config(config: PretrainedConfig) -> tuple[int, ...]: version_float = getattr(config, "version", None) @@ -354,9 +538,7 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: mm_limits = {"image": None} - if self.get_model_version() == (2, - 6) or self.get_model_version() == (4, - 0): + if self.get_model_version() in {(2, 6), (4, 0), (4, 5)}: mm_limits["video"] = None return mm_limits @@ -637,8 +819,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): out_keys: set[str], ) -> dict[str, NestedTensors]: # This processor supports zipping prompt and mm_data together - if self.info.get_model_version() == ( - 2, 6) or self.info.get_model_version() == (4, 0): + if self.info.get_model_version() in {(2, 6), (4, 0), (4, 5)}: inputs = super()._call_hf_processor( prompt=prompts, # type: ignore mm_data=mm_data, @@ -816,7 +997,6 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): # and config class self.config = config self.multimodal_config = multimodal_config - self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.version = get_version_by_config(self.config) self.llm = self.init_llm(vllm_config=vllm_config, @@ -1364,11 +1544,9 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA): prefix: str = "", ) -> nn.Module: quant_config = self._maybe_ignore_quant_config(quant_config) - model = Idefics2VisionTransformer( - config.vision_config, - quant_config=quant_config, - prefix=prefix, - use_data_parallel=self.use_data_parallel) + model = Idefics2VisionTransformer(config.vision_config, + quant_config=quant_config, + prefix=prefix) if self.config.drop_vision_last_layer: model.encoder.layers = 
model.encoder.layers[:-1] return model @@ -1436,11 +1614,121 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA): return loader.load_weights(weights) +class MiniCPMV4_5(MiniCPMVBaseModel, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + assert self.version == (4, 5) + + def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + if isinstance(quant_config, (AWQConfig, AWQMarlinConfig)): + return None + return quant_config + + def init_llm( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> nn.Module: + return Qwen3ForCausalLM(vllm_config=vllm_config, prefix=prefix) + + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> nn.Module: + quant_config = self._maybe_ignore_quant_config(quant_config) + model = Idefics2VisionTransformer(config.vision_config, + quant_config=quant_config, + prefix=prefix) + if self.config.drop_vision_last_layer: + model.encoder.layers = model.encoder.layers[:-1] + return model + + def init_resampler( + self, + embed_dim: int, + vision_dim: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> nn.Module: + quant_config = self._maybe_ignore_quant_config(quant_config) + with set_default_torch_dtype(torch.float16): + # 4.5 uses Resampler4_5, which extends the 2.5/2.6 resampler with temporal position embeddings.
+ resampler = Resampler4_5(num_queries=self.config.query_num, + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_dim, + quant_config=quant_config, + prefix=prefix) + + return resampler.to(device=current_platform.device_type, + dtype=torch.get_default_dtype()) + + def get_vision_hidden_states( + self, data: MiniCPMVImagePixelInputs) -> torch.Tensor: + pixel_values = data["pixel_values"] + tgt_sizes = data["tgt_sizes"] + temporal_ids = data.get('temporal_ids', None) + + B = len(pixel_values) + P = pixel_values[0].shape[-2] + L = max(item.shape[-1] for item in pixel_values) + device = pixel_values[0].device + dtype = pixel_values[0].dtype + + all_pixel_values = torch.zeros((B, 3, P, L), + dtype=dtype, + device=device) + all_temporal_ids = None if temporal_ids is None else flatten_2d_lists( + temporal_ids) + for i, pixel_values_item in enumerate(pixel_values): + L_item = pixel_values_item.shape[-1] + all_pixel_values[i, ..., :L_item] = pixel_values_item + + num_patches = tgt_sizes.prod(-1) + max_patches = num_patches.max().item() + assert isinstance(max_patches, int) + + patch_attn_mask = torch.zeros((B, max_patches), + dtype=torch.bool, + device=device) + for i, num_patches_item in enumerate(num_patches): + patch_attn_mask[i, :num_patches_item] = True + + vision_embedding = self.vpm( + all_pixel_values, + patch_attention_mask=patch_attn_mask.unsqueeze(1), + tgt_sizes=tgt_sizes, + ) + + return self.resampler(vision_embedding, tgt_sizes, all_temporal_ids) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self, + skip_prefixes=["apm.", "audio", "tts"]) + return loader.load_weights(weights) + + _SUPPORT_VERSION = { (2, 0): MiniCPMV2_0, (2, 5): MiniCPMV2_5, (2, 6): MiniCPMV2_6, (4, 0): MiniCPMV4_0, + (4, 5): MiniCPMV4_5, } diff --git a/vllm/transformers_utils/chat_templates/registry.py b/vllm/transformers_utils/chat_templates/registry.py index e0ef7f0999d47..d09c5fa924fb0 100644 --- 
a/vllm/transformers_utils/chat_templates/registry.py +++ b/vllm/transformers_utils/chat_templates/registry.py @@ -20,6 +20,16 @@ def _get_qwen_chat_template_fallback( return CHAT_TEMPLATES_DIR / "template_basic.jinja" +def _get_minicpmv_chat_template_fallback( + tokenizer_name_or_path: str) -> Optional[Path]: + # MiniCPM-V-4.5 version uses a dedicated template + if "4.5" in tokenizer_name_or_path or "4_5" in tokenizer_name_or_path: + return CHAT_TEMPLATES_DIR / "template_minicpmv45.jinja" + + # Other versions use chatml template + return CHAT_TEMPLATES_DIR / "template_chatml.jinja" + + # yapf: disable _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = { "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja", @@ -27,6 +37,7 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = { "deepseek_vl_v2": CHAT_TEMPLATES_DIR / "template_deepseek_vl2.jinja", "florence2": CHAT_TEMPLATES_DIR / "template_basic.jinja", "fuyu": CHAT_TEMPLATES_DIR / "template_fuyu.jinja", + "minicpmv": _get_minicpmv_chat_template_fallback, "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja", "qwen": _get_qwen_chat_template_fallback, } diff --git a/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja b/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja new file mode 100644 index 0000000000000..661ebd1cf5c17 --- /dev/null +++ b/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja @@ -0,0 +1,93 @@ +{%- set enable_thinking = enable_thinking | default(false) %} +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n</tools>\n\nFor each function call, return a json object with 
function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} + +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} + +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '</think>' in message.content %} + {%- set content = message.content.split('</think>')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + + {%- if message.tool_calls %} + {%- for tool_call 
in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '<tool_call>\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n</tool_call>' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n<tool_response>\n' }} + {{- message.content }} + {{- '\n</tool_response>' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} + +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '<think>\n\n</think>\n\n' }} + {%- endif %} + {%- if enable_thinking is defined and enable_thinking is true %} + {{- '<think>\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file From 8c13820f0b203976eab8e821c102234a73f338cd Mon Sep 17 00:00:00 2001 From: cndoit18 <cndoit18@outlook.com> Date: Wed, 27 Aug 2025 20:42:20 +0800 Subject: [PATCH 084/112] [Bugfix] Fix task field initialization when PYTHONOPTIMIZE is enabled (#23718) Signed-off-by: cndoit18 <cndoit18@outlook.com> --- vllm/worker/pooling_model_runner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index 8d8d9b4d0503f..3e1950798dbf6 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -199,8 +199,9 @@ class PoolingModelRunner( pooling_params = seq_group_metadata.pooling_params assert pooling_params is not None - assert (task := pooling_params.task) is not 
None, ( - "You did not set `task` in the API") + + task = pooling_params.task + assert task is not None, "You did not set `task` in the API" model = cast(VllmModelForPooling, self.model) to_update = model.pooler.get_pooling_updates(task) From a403d0fa41cc68e3b6da4e1097dc896fde2f1a6a Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Wed, 27 Aug 2025 05:50:47 -0700 Subject: [PATCH 085/112] [Misc] Remove unnecessary `_send_reconfig_message()` in `core_client.py` (#23127) Signed-off-by: Nick Hill <nhill@redhat.com> --- vllm/v1/engine/core_client.py | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 079dd9a7d38d1..65f7abc97110c 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1190,21 +1190,6 @@ class DPLBAsyncMPClient(DPAsyncMPClient): await self._send_input(EngineCoreRequestType.ABORT, request_ids, engine) - async def _send_reconfig_message( - self, reconfig_request: ReconfigureDistributedRequest, - engine: EngineIdentity) -> asyncio.Future: - """Send reconfiguration message and return the result future without - waiting for completion.""" - call_id = uuid.uuid1().int >> 64 - future = asyncio.get_running_loop().create_future() - self.utility_results[call_id] = future - message = (EngineCoreRequestType.UTILITY.value, *self.encoder.encode( - (self.client_index, call_id, "reinitialize_distributed", - (reconfig_request, )))) - await self._send_input_message(message, engine, reconfig_request) - self._ensure_output_queue_task() - return future - async def scale_elastic_ep(self, new_data_parallel_size: int) -> None: """Scale elastic EP data parallel size""" cur_data_parallel_size = len(self.core_engines) @@ -1214,7 +1199,7 @@ class DPLBAsyncMPClient(DPAsyncMPClient): f"different from cur_data_parallel_size {cur_data_parallel_size}") assert self.vllm_config.parallel_config.data_parallel_backend == \ - "ray", 
("Only ray DP backend supports scaling elastic EP") + "ray", "Only ray DP backend supports scaling elastic EP" scale_up = new_data_parallel_size > cur_data_parallel_size @@ -1246,9 +1231,10 @@ class DPLBAsyncMPClient(DPAsyncMPClient): data_parallel_master_ip, new_data_parallel_master_port=self.vllm_config.parallel_config. data_parallel_master_port) - future = await self._send_reconfig_message(reconfig_request, - engine) - reconfig_futures.append(future) + coro = self._call_utility_async("reinitialize_distributed", + reconfig_request, + engine=engine) + reconfig_futures.append(asyncio.create_task(coro)) logger.info("All reconfigure messages sent, starting engine creation") @@ -1318,9 +1304,10 @@ class DPLBAsyncMPClient(DPAsyncMPClient): if cur_dp_rank >= new_data_parallel_size: reconfig_request.new_data_parallel_rank = \ ReconfigureRankType.SHUTDOWN_CURRENT_RANK - future = await self._send_reconfig_message(reconfig_request, - engine) - reconfig_futures.append(future) + coro = self._call_utility_async("reinitialize_distributed", + reconfig_request, + engine=engine) + reconfig_futures.append(asyncio.create_task(coro)) for _ in range(new_data_parallel_size, cur_data_parallel_size): self.core_engines.pop() From 704432af3c129b7a57fca9b059eefe214159f836 Mon Sep 17 00:00:00 2001 From: Thomas Parnell <tpa@zurich.ibm.com> Date: Wed, 27 Aug 2025 14:51:54 +0200 Subject: [PATCH 086/112] [V1] [Hybrid] Disable prefix caching by default for hybrid or mamba-based models (#23716) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> --- docs/usage/v1_guide.md | 10 ++++++---- vllm/model_executor/models/config.py | 9 +++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 64bd0d9bf5071..20234e7611333 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -107,14 +107,16 @@ to enable simultaneous generation and embedding using the same engine instance i #### Mamba Models Models using selective state-space 
mechanisms instead of standard transformer attention are supported. -Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1. +Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. +Please note that prefix caching is not yet supported for these models. Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, -`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that -these models currently require disabling prefix caching in V1. +`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). +Please note that prefix caching is not yet supported for these models. Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`). -Please note that these models currently require disabling prefix caching and enforcing eager mode in V1. +Please note that prefix caching is not yet supported for these models. +It is also necessary to enforce eager mode for these models in V1. 
#### Encoder-Decoder Models diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index f62209326b988..88b3154de2cbb 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -292,12 +292,13 @@ class MambaModelConfig(VerifyAndUpdateConfig): return model_config = vllm_config.model_config + cache_config = vllm_config.cache_config compilation_config = vllm_config.compilation_config - model_cls, _ = ModelRegistry.resolve_model_cls( - model_config.architecture, - model_config=model_config, - ) + # TODO(tdoublep): remove once prefix caching is enabled + cache_config.enable_prefix_caching = False + logger.info("Hybrid or mamba-based model detected: disabling prefix " + "caching since it is not yet supported.") # TODO(tdoublep): remove as full cuda graph support is added FCG_NOT_SUPPORTED_MODELS = [ From 5eeef1b90852917b300ed67b98e341eb846ba2e9 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Wed, 27 Aug 2025 21:24:09 +0800 Subject: [PATCH 087/112] [Model] Explicit `default_pooling_type` interface (#23736) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/model_executor/models/bert.py | 4 +-- vllm/model_executor/models/bert_with_rope.py | 5 ++-- vllm/model_executor/models/gritlm.py | 2 +- vllm/model_executor/models/interfaces.py | 19 +------------ vllm/model_executor/models/interfaces_base.py | 28 +++++++++++++++++++ vllm/model_executor/models/internlm2.py | 3 +- vllm/model_executor/models/modernbert.py | 3 +- .../models/prithvi_geospatial_mae.py | 7 +++-- vllm/model_executor/models/qwen2_rm.py | 3 +- vllm/model_executor/models/registry.py | 7 +++-- vllm/model_executor/models/roberta.py | 3 +- 11 files changed, 51 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 22b6c4401213c..b34ca5cbe963d 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -28,8 
+28,8 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import (SupportsCrossEncoding, SupportsQuant, - default_pooling_type) +from .interfaces import SupportsCrossEncoding, SupportsQuant +from .interfaces_base import default_pooling_type from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 129450927e564..dcb7e75456cde 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -27,13 +27,14 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import (SupportsQuant, - default_pooling_type) from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors +from .interfaces import SupportsQuant +from .interfaces_base import default_pooling_type + class BertWithRopeEmbedding(nn.Module): diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 3f6790269ae62..1b3d541c65cf8 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -20,7 +20,7 @@ from vllm.sequence import PoolerOutput from vllm.tasks import PoolingTask from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config -from .interfaces import default_pooling_type +from .interfaces_base import default_pooling_type logger = init_logger(__name__) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 9415e67924e74..22f005849e864 
100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -3,7 +3,7 @@ from collections.abc import Iterable, Mapping, MutableSequence from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, - TypeVar, Union, overload, runtime_checkable) + Union, overload, runtime_checkable) import numpy as np import torch @@ -641,23 +641,6 @@ def supports_cross_encoding( return is_pooling_model(model) and _supports_cross_encoding(model) -_T = TypeVar("_T", bound=type[torch.nn.Module]) - - -def default_pooling_type(pooling_type: str): - """Set default_pooling_type decorator. """ - - def func(model: _T) -> _T: - model.default_pooling_type = pooling_type # type: ignore - return model - - return func - - -def get_default_pooling_type(model: Union[type[object], object]) -> str: - return getattr(model, "default_pooling_type", "LAST") - - class SupportsQuant: """The interface required for all models that support quantization.""" diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 697fa020deb46..19a3ef1a3b800 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -144,6 +144,17 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): MRO of your model class. """ + default_pooling_type: ClassVar[str] = "LAST" + """ + Indicates the + [vllm.model_executor.layers.pooler.PoolerConfig.pooling_type][] + to use by default. + + You can use the + [vllm.model_executor.models.interfaces_base.default_pooling_type][] + decorator to conveniently set this field. 
+ """ + pooler: Pooler """The pooler is only called on TP rank 0.""" @@ -165,3 +176,20 @@ def is_pooling_model( return False return getattr(model, "is_pooling_model", False) + + +_T = TypeVar("_T", bound=type[nn.Module]) + + +def default_pooling_type(pooling_type: str): + """Decorator to set `VllmModelForPooling.default_pooling_type`.""" + + def func(model: _T) -> _T: + model.default_pooling_type = pooling_type # type: ignore + return model + + return func + + +def get_default_pooling_type(model: Union[type[object], object]) -> str: + return getattr(model, "default_pooling_type", "LAST") diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index d0c4bf5450d6d..26bc48ffbd9bc 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -31,7 +31,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP, default_pooling_type +from .interfaces import SupportsLoRA, SupportsPP +from .interfaces_base import default_pooling_type from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 72290bf2ee29f..4778555861286 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -26,7 +26,8 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import SupportsCrossEncoding, default_pooling_type +from .interfaces import SupportsCrossEncoding +from .interfaces_base import default_pooling_type from .utils import WeightsMapper, maybe_prefix diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py 
b/vllm/model_executor/models/prithvi_geospatial_mae.py index 59e9f3e8a47b0..f46d6375e1f61 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -27,9 +27,6 @@ from transformers import BatchFeature from vllm.config import VllmConfig from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import ( - IsAttentionFree, MultiModalEmbeddings, SupportsMultiModalWithRawInput, - default_pooling_type) from vllm.model_executor.models.utils import AutoWeightsLoader from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (ImageItem, ModalityData, @@ -43,6 +40,10 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from .interfaces import (IsAttentionFree, MultiModalEmbeddings, + SupportsMultiModalWithRawInput) +from .interfaces_base import default_pooling_type + def _prithvi_field_config(hf_inputs: Mapping[str, torch.Tensor]): # This model receives in input a multi-dimensional tensor representing diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index e0a30e04c602a..421b43563bade 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -18,7 +18,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP, default_pooling_type +from .interfaces import SupportsLoRA, SupportsPP +from .interfaces_base import default_pooling_type from .qwen2 import Qwen2Model from .utils import AutoWeightsLoader, maybe_prefix diff --git a/vllm/model_executor/models/registry.py 
b/vllm/model_executor/models/registry.py index c65c58d4a047f..196b5f35e1e4f 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -25,11 +25,12 @@ from vllm.logger import init_logger from vllm.transformers_utils.dynamic_module import ( try_get_class_from_dynamic_module) -from .interfaces import (get_default_pooling_type, has_inner_state, has_noops, - is_attention_free, is_hybrid, supports_cross_encoding, +from .interfaces import (has_inner_state, has_noops, is_attention_free, + is_hybrid, supports_cross_encoding, supports_multimodal, supports_multimodal_raw_input, supports_pp, supports_transcription, supports_v0_only) -from .interfaces_base import is_pooling_model, is_text_generation_model +from .interfaces_base import (get_default_pooling_type, is_pooling_model, + is_text_generation_model) logger = init_logger(__name__) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 49a37342c67fa..2bfa51162910b 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -22,7 +22,8 @@ from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, from vllm.sequence import IntermediateTensors from .bert_with_rope import BertWithRope, JinaRobertaModel -from .interfaces import SupportsCrossEncoding, default_pooling_type +from .interfaces import SupportsCrossEncoding +from .interfaces_base import default_pooling_type class RobertaEmbedding(nn.Module): From 8dd2baa5978f123974177023d6efab731153a2f4 Mon Sep 17 00:00:00 2001 From: rebel-hongseok <hongseok@rebellions.ai> Date: Wed, 27 Aug 2025 22:25:49 +0900 Subject: [PATCH 088/112] Add vLLM Korea Meetup in the README.md and meetups.md (#23746) Signed-off-by: rebel-hongseok <hongseok@rebellions.ai> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- README.md | 1 + docs/community/meetups.md | 1 + 2 files changed, 2 insertions(+) diff --git 
a/README.md b/README.md index ef5b43588953c..8812aac4ea266 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Easy, fast, and cheap LLM serving for everyone *Latest News* 🔥 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH). +- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). - [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152). - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/). - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). diff --git a/docs/community/meetups.md b/docs/community/meetups.md index 61ea44220ad2e..d76238cb31791 100644 --- a/docs/community/meetups.md +++ b/docs/community/meetups.md @@ -3,6 +3,7 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. 
Please find the materials of our previous meetups below: - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH) +- [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152). - [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing) - [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). From 16dc4052b004261b547fc50fe7b20e2d2fbf915d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 27 Aug 2025 14:39:48 +0100 Subject: [PATCH 089/112] Fix pre-commit on main (#23747) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/community/meetups.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/community/meetups.md b/docs/community/meetups.md index d76238cb31791..221a7bd96213f 100644 --- a/docs/community/meetups.md +++ b/docs/community/meetups.md @@ -3,7 +3,7 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. 
Please find the materials of our previous meetups below: - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH) -- [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). +- [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152). - [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing) - [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). 
From fe8d7b6f03e7d8a36ffb6931397fc81ee594dd64 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Wed, 27 Aug 2025 21:41:22 +0800 Subject: [PATCH 090/112] [Model] Interface to enable batch-level DP support (#23733) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/configuration/optimization.md | 7 +++++-- vllm/config/__init__.py | 7 +++++++ vllm/model_executor/models/interfaces.py | 11 +++++++++++ vllm/model_executor/models/minicpmv.py | 2 ++ vllm/model_executor/models/mllama4.py | 2 ++ vllm/model_executor/models/qwen2_5_vl.py | 2 ++ vllm/model_executor/models/registry.py | 9 +++++++-- vllm/model_executor/models/step3_vl.py | 2 ++ 8 files changed, 38 insertions(+), 4 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index a8eab9985c8b9..b11ccb5c00273 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -168,8 +168,11 @@ llm = LLM( Batch-level DP is not to be confused with API request-level DP (which is instead controlled by `data_parallel_size`). -The availability of batch-level DP is based on model implementation. -Currently, the following models support `mm_encoder_tp_mode="data"`: +Batch-level DP needs to be implemented on a per-model basis, +and enabled by setting `supports_encoder_tp_data = True` in the model class. +Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to use this feature. 
+ +Known supported models: - Llama4 (<gh-pr:18368>) - MiniCPM-V-4 (<gh-pr:23327>) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index ac6f51df95498..e3fb6d796def5 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -872,6 +872,13 @@ class ModelConfig: def _init_multimodal_config(self) -> Optional["MultiModalConfig"]: if self._model_info.supports_multimodal: + if (self.mm_encoder_tp_mode == "data" and + not self._model_info.supports_multimodal_encoder_tp_data): + logger.warning_once( + "This model does not support `--mm-encoder-tp-mode data`. " + "Falling back to `--mm-encoder-tp-mode weights`.") + self.mm_encoder_tp_mode = "weights" + return MultiModalConfig( limit_per_prompt=self.limit_mm_per_prompt, media_io_kwargs=self.media_io_kwargs, diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 22f005849e864..506732fed3614 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -52,6 +52,12 @@ class SupportsMultiModal(Protocol): MRO of your model class. """ + supports_encoder_tp_data: ClassVar[bool] = False + """ + A flag that indicates whether this model supports + `multimodal_config.mm_encoder_tp_mode="data"`. 
+ """ + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: """ @@ -137,6 +143,11 @@ def supports_multimodal( return getattr(model, "supports_multimodal", False) +def supports_multimodal_encoder_tp_data( + model: Union[type[object], object]) -> bool: + return getattr(model, "supports_encoder_tp_data", False) + + @runtime_checkable class SupportsMultiModalWithRawInput(SupportsMultiModal, Protocol): """The interface required for all multi-modal models.""" diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 2d785c30fd7df..0181bfeebda08 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1521,6 +1521,8 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA): ], } + supports_encoder_tp_data = True + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) assert self.version == (4, 0) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 595bdd17cf2c2..ac9b968f7a0cd 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -716,6 +716,8 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, "gate_up_proj": ["gate_proj", "up_proj"], } + supports_encoder_tp_data = True + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if modality.startswith("image"): diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 648ba81eb3877..b528083b7c9cc 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -868,6 +868,8 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, "model.": "language_model.model.", }) + supports_encoder_tp_data = True + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if 
modality.startswith("image"): diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 196b5f35e1e4f..80eac78cdfadb 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -27,8 +27,10 @@ from vllm.transformers_utils.dynamic_module import ( from .interfaces import (has_inner_state, has_noops, is_attention_free, is_hybrid, supports_cross_encoding, - supports_multimodal, supports_multimodal_raw_input, - supports_pp, supports_transcription, supports_v0_only) + supports_multimodal, + supports_multimodal_encoder_tp_data, + supports_multimodal_raw_input, supports_pp, + supports_transcription, supports_v0_only) from .interfaces_base import (get_default_pooling_type, is_pooling_model, is_text_generation_model) @@ -324,6 +326,7 @@ class _ModelInfo: supports_cross_encoding: bool supports_multimodal: bool supports_multimodal_raw_input: bool + supports_multimodal_encoder_tp_data: bool supports_pp: bool has_inner_state: bool is_attention_free: bool @@ -343,6 +346,8 @@ class _ModelInfo: supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), supports_multimodal_raw_input=supports_multimodal_raw_input(model), + supports_multimodal_encoder_tp_data= + supports_multimodal_encoder_tp_data(model), supports_pp=supports_pp(model), has_inner_state=has_inner_state(model), is_attention_free=is_attention_free(model), diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index f8877b584b198..f379d2c15fb6c 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -867,6 +867,8 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, "lm_head.": "language_model.lm_head.", }) + supports_encoder_tp_data = True + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if modality.startswith("image"): From 
513c1fe255f7d4ec3e91f7f5c2dd2d97c0460765 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 27 Aug 2025 14:55:12 +0100 Subject: [PATCH 091/112] Only run `get_attr_docs` if generating help text (#23723) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9e7c95ea5205f..3399d505e3631 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -152,9 +152,17 @@ def is_online_quantization(quantization: Any) -> bool: return quantization in ["inc"] +NEEDS_HELP = ( + "--help" in (argv := sys.argv) # vllm SUBCOMMAND --help + or (argv0 := argv[0]).endswith("mkdocs") # mkdocs SUBCOMMAND + or argv0.endswith("mkdocs/__main__.py") # python -m mkdocs SUBCOMMAND +) + + @functools.lru_cache(maxsize=30) def _compute_kwargs(cls: ConfigType) -> dict[str, Any]: - cls_docs = get_attr_docs(cls) + # Save time only getting attr docs if we're generating help text + cls_docs = get_attr_docs(cls) if NEEDS_HELP else {} kwargs = {} for field in fields(cls): # Get the set of possible types for the field @@ -172,7 +180,7 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]: # Get the help text for the field name = field.name - help = cls_docs[name].strip() + help = cls_docs.get(name, "").strip() # Escape % for argparse help = help.replace("%", "%%") @@ -254,6 +262,9 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]: def get_kwargs(cls: ConfigType) -> dict[str, Any]: """Return argparse kwargs for the given Config dataclass. + If `--help` or `mkdocs` are not present in the command line command, the + attribute documentation will not be included in the help output. + The heavy computation is cached via functools.lru_cache, and a deep copy is returned so callers can mutate the dictionary without affecting the cached version. 
From 3af47c3cc693f432b59658019891393385aa0e2a Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 27 Aug 2025 10:09:08 -0400 Subject: [PATCH 092/112] [Feature] Add Hopper DeepGEMM E8M0 for DeepSeekV3.1 scale_fmt (#23666) Signed-off-by: yewentao256 <zhyanwentao@126.com> Signed-off-by: youkaichao <youkaichao@gmail.com> Co-authored-by: youkaichao <youkaichao@gmail.com> --- tests/kernels/moe/test_block_fp8.py | 5 +- tests/kernels/moe/test_deepep_deepgemm_moe.py | 7 ++- vllm/envs.py | 8 ++- .../layers/fused_moe/batched_deep_gemm_moe.py | 4 +- .../layers/fused_moe/fused_moe.py | 7 ++- .../layers/fused_moe/triton_deep_gemm_moe.py | 6 +-- .../model_executor/layers/quantization/fp8.py | 9 ++-- .../layers/quantization/utils/fp8_utils.py | 4 +- vllm/transformers_utils/config.py | 18 +++++++ vllm/utils/deep_gemm.py | 53 +++++++++---------- 10 files changed, 68 insertions(+), 53 deletions(-) diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index 9e4eaf221f245..ecc57acc67963 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -16,7 +16,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, modular_triton_fused_moe) from vllm.platforms import current_platform from vllm.utils import has_deep_gemm -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used dg_available = has_deep_gemm() @@ -226,8 +226,7 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed, @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.") -@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), - reason="Not E8M0 scale MOE") +@pytest.mark.skipif(is_deep_gemm_e8m0_used(), reason="Not E8M0 scale MOE") @torch.inference_mode() def test_w8a8_block_fp8_deep_gemm_fused_moe(M, 
N, K, E, topk, seed, monkeypatch): diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 1e922be47f2b4..36a98522a6588 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -20,8 +20,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEModularKernel) from vllm.platforms import current_platform from vllm.utils import has_deep_ep, has_deep_gemm -from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, - is_deep_gemm_supported) +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch @@ -374,7 +373,7 @@ NUM_EXPERTS = [32] @multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), +@pytest.mark.skipif(is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM") def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int, topk: int, world_dp_size: tuple[int, int]): @@ -432,7 +431,7 @@ USE_FP8_DISPATCH = [False] @multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), +@pytest.mark.skipif(is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM") def test_ll_deepep_deepgemm_moe( mnk: tuple[int, int, int], diff --git a/vllm/envs.py b/vllm/envs.py index 66c7c2c7f2c4d..35735b552575b 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -131,6 +131,7 @@ if TYPE_CHECKING: VLLM_TPU_USING_PATHWAYS: bool = False VLLM_USE_DEEP_GEMM: bool = False VLLM_USE_DEEP_GEMM_E8M0: bool = True + VLLM_USE_DEEP_GEMM_E8M0_HOPPER: bool = False VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True VLLM_USE_FLASHINFER_MOE_FP8: bool = False @@ -954,9 +955,12 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: 
bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), # Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs. - # E8M0 is faster on B200 but may reduce accuracy. "VLLM_USE_DEEP_GEMM_E8M0": lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))), + # TODO(wentao): unify the two E8M0 flags after verifying the correctness. + # Whether to use E8M0 scaling when DeepGEMM is used on Hopper GPUs. + "VLLM_USE_DEEP_GEMM_E8M0_HOPPER": + lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "0"))), # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm # JIT all the required kernels before model execution so there is no # JIT'ing in the hot-path. However, this warmup increases the engine @@ -1244,6 +1248,8 @@ def compute_hash() -> str: "VLLM_USE_FLASHINFER_SAMPLER", "VLLM_DISABLED_KERNELS", "VLLM_USE_DEEP_GEMM", + "VLLM_USE_DEEP_GEMM_E8M0", + "VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "VLLM_USE_TRTLLM_FP4_GEMM", "VLLM_USE_FUSED_MOE_GROUPED_TOPK", "VLLM_USE_FLASHINFER_MOE_FP8", diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index c4d680af932f0..a5326dfe84f6d 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -12,7 +12,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.triton_utils import tl, triton from vllm.utils.deep_gemm import (fp8_m_grouped_gemm_nt_masked, - is_blackwell_deep_gemm_e8m0_used) + is_deep_gemm_e8m0_used) logger = init_logger(__name__) @@ -174,7 +174,7 @@ def silu_mul_fp8_quant_deep_gemm( eps, fp8_min, fp8_max, - is_blackwell_deep_gemm_e8m0_used(), + is_deep_gemm_e8m0_used(), BLOCK=group_size, NUM_STAGES=4, num_warps=1, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 
84dafcf00d821..17a5c735a57fe 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -40,7 +40,7 @@ from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled @@ -1431,9 +1431,8 @@ def fused_experts(hidden_states: torch.Tensor, # E8M0 scale, which means we requantize the weight and input to the specific # scale. Fallen back to cutlass or triton for some cases would cause # accuracy issue. - if (allow_deep_gemm and use_fp8_w8a8 - and (is_blackwell_deep_gemm_e8m0_used() - or _valid_deep_gemm(hidden_states, w1, w2))): + if (allow_deep_gemm and use_fp8_w8a8 and + (is_deep_gemm_e8m0_used() or _valid_deep_gemm(hidden_states, w1, w2))): assert apply_router_weight_on_input is False assert is_act_and_mul, ( "DeepGemm only supports is_act_and_mul=True for now.") diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index 486ca881df48c..6cd81d97f0298 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -10,7 +10,7 @@ from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( DeepGemmExperts, _valid_deep_gemm, _valid_deep_gemm_shape, deep_gemm_block_shape) from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): @@ -107,7 +107,7 @@ class 
TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): # Note: the deep gemm workspaces are strictly larger than the triton # workspaces so we can be pessimistic here and allocate for DeepGemm # even if we fall back to triton later, e.g. if expert maps are set. - if self.allow_deep_gemm and (is_blackwell_deep_gemm_e8m0_used() + if self.allow_deep_gemm and (is_deep_gemm_e8m0_used() or _valid_deep_gemm_shape(M, N, K)): assert self.deep_gemm_expert is not None return self.deep_gemm_expert.workspace_shapes( @@ -143,7 +143,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): ): use_deep_gemm = (self.allow_deep_gemm and (_valid_deep_gemm(hidden_states, w1, w2) - or is_blackwell_deep_gemm_e8m0_used())) + or is_deep_gemm_e8m0_used())) experts = self.deep_gemm_expert if use_deep_gemm else self.triton_expert assert experts is not None diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index d45d368b582df..be358cfa949f0 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -48,8 +48,7 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.scalar_type import scalar_types from vllm.utils import has_deep_gemm -from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, - is_deep_gemm_supported) +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported from vllm.utils.flashinfer import has_flashinfer_moe if TYPE_CHECKING: @@ -427,7 +426,7 @@ class Fp8LinearMethod(LinearMethodBase): # On B200, if E8M0 for DeepGemm is used, we need to # requantize the weight and input to the specific scale # at the same time. 
- if is_blackwell_deep_gemm_e8m0_used(): + if is_deep_gemm_e8m0_used(): assert layer.weight_block_size is not None block_sz = tuple(layer.weight_block_size) requant_weight_ue8m0_inplace( @@ -734,7 +733,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): # DeepGemm scales need to be transposed and aligned. We try to do # it ahead of time for performance reasons. - if self.allow_deep_gemm and not is_blackwell_deep_gemm_e8m0_used(): + if self.allow_deep_gemm and not is_deep_gemm_e8m0_used(): # Lazy import to avoid CUDA initialization problems. if _is_col_major(layer.w13_weight_scale_inv): layer.w13_weight_scale_inv = \ @@ -871,7 +870,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): del layer.w13_input_scale del layer.w2_input_scale - if is_blackwell_deep_gemm_e8m0_used(): + if is_deep_gemm_e8m0_used(): assert layer.weight_block_size is not None # Re-quantise the expert weights so their scales are UE8M0. block_sz = tuple(layer.weight_block_size) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index ab1d5383f4651..7b324dce3c367 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import cdiv, direct_register_custom_op -from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, +from vllm.utils.deep_gemm import (is_deep_gemm_e8m0_used, should_use_deepgemm_for_fp8_linear) logger = init_logger(__name__) @@ -385,7 +385,7 @@ def per_token_group_quant_fp8( scaling factor. 
""" if use_ue8m0 is None: - use_ue8m0 = is_blackwell_deep_gemm_e8m0_used() + use_ue8m0 = is_deep_gemm_e8m0_used() dtype = current_platform.fp8_dtype() if dtype is None else dtype assert (x.shape[-1] % group_size == 0), ( f"the last dimension of `x` {x.shape[-1]} must be divisible " diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 2cd799e5eb5a9..bec792465bfbb 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -501,6 +501,24 @@ def get_config( if quantization_config is not None: config.quantization_config = quantization_config + # auto-enable DeepGEMM UE8M0 on Hopper if model config requests it + scale_fmt = quantization_config.get("scale_fmt", None) + if scale_fmt in ("ue8m0", ): + if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0_HOPPER"): + os.environ["VLLM_USE_DEEP_GEMM_E8M0_HOPPER"] = "1" + logger.info_once( + ("Detected quantization_config.scale_fmt=%s; " + "enabling Hopper UE8M0."), + scale_fmt, + ) + elif not envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER: + logger.warning_once( + ("Model config requests UE8M0 " + "(quantization_config.scale_fmt=%s), but " + "VLLM_USE_DEEP_GEMM_E8M0_HOPPER=0 is set; " + "Hopper UE8M0 disabled."), + scale_fmt, + ) if hf_overrides_kw: logger.debug("Overriding HF config with %s", hf_overrides_kw) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index b0bc3a79eb0ad..cd1dbfb813fee 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -31,34 +31,33 @@ def is_deep_gemm_supported() -> bool: @functools.cache -def is_blackwell_deep_gemm_e8m0_used() -> bool: +def is_deep_gemm_e8m0_used() -> bool: """Return ``True`` if vLLM is configured to use DeepGEMM " - "E8M0 scale on a Blackwell-class GPU. + "E8M0 scale on a Hopper or Blackwell-class GPU. 
""" if not is_deep_gemm_supported(): - logger.debug_once( + logger.info_once( "DeepGEMM E8M0 disabled: DeepGEMM not supported on this system.") return False - if not envs.VLLM_USE_DEEP_GEMM_E8M0: - logger.debug_once("DeepGEMM E8M0 disabled: VLLM_USE_DEEP_GEMM_E8M0=0.") - return False - _lazy_init() if _fp8_gemm_nt_impl is None: - logger.debug_once( - "DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found") + logger.info_once("DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found") return False - enabled = (current_platform.is_cuda() - and current_platform.has_device_capability(100)) - if enabled: - logger.debug_once("DeepGEMM E8M0 enabled on Blackwell GPU.") - else: - logger.debug_once( - "DeepGEMM E8M0 disabled: not running on Blackwell GPU.") - return enabled + if current_platform.is_device_capability(100) and \ + envs.VLLM_USE_DEEP_GEMM_E8M0: + logger.info_once("DeepGEMM E8M0 enabled on Blackwell GPU.") + return True + + if current_platform.is_device_capability(90) and \ + envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER: + logger.info_once("DeepGEMM E8M0 enabled on Hopper GPU.") + return True + + logger.info_once("DeepGEMM E8M0 disabled on current configuration.") + return False def _missing(*_: Any, **__: Any) -> NoReturn: @@ -124,20 +123,18 @@ def fp8_gemm_nt(*args, **kwargs): _lazy_init() if _fp8_gemm_nt_impl is None: return _missing(*args, **kwargs) - return _fp8_gemm_nt_impl( - *args, - disable_ue8m0_cast=not is_blackwell_deep_gemm_e8m0_used(), - **kwargs) + return _fp8_gemm_nt_impl(*args, + disable_ue8m0_cast=not is_deep_gemm_e8m0_used(), + **kwargs) def m_grouped_fp8_gemm_nt_contiguous(*args, **kwargs): _lazy_init() if _grouped_impl is None: return _missing(*args, **kwargs) - return _grouped_impl( - *args, - disable_ue8m0_cast=not is_blackwell_deep_gemm_e8m0_used(), - **kwargs) + return _grouped_impl(*args, + disable_ue8m0_cast=not is_deep_gemm_e8m0_used(), + **kwargs) def fp8_m_grouped_gemm_nt_masked(*args, **kwargs): @@ -145,9 +142,7 @@ def 
fp8_m_grouped_gemm_nt_masked(*args, **kwargs): if _grouped_masked_impl is None: return _missing(*args, **kwargs) return _grouped_masked_impl( - *args, - disable_ue8m0_cast=not is_blackwell_deep_gemm_e8m0_used(), - **kwargs) + *args, disable_ue8m0_cast=not is_deep_gemm_e8m0_used(), **kwargs) def _ceil_to_ue8m0(x: torch.Tensor): @@ -211,7 +206,7 @@ __all__ = [ "m_grouped_fp8_gemm_nt_contiguous", "fp8_m_grouped_gemm_nt_masked", "per_block_cast_to_fp8", - "is_blackwell_deep_gemm_e8m0_used", + "is_deep_gemm_e8m0_used", "is_deep_gemm_supported", "should_use_deepgemm_for_fp8_linear", ] From 841490434aaee4b1c8d8427112af740b6662f384 Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Wed, 27 Aug 2025 22:45:17 +0800 Subject: [PATCH 093/112] [Model] Enable native HF format InternVL support (#23742) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- docs/models/supported_models.md | 1 + .../multimodal/generation/test_common.py | 29 +++++++++---------- tests/models/registry.py | 3 +- vllm/model_executor/models/registry.py | 1 + 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 35a5fa0c2e42f..20cf75873af76 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -629,6 +629,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | | `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. 
| ✅︎ | ✅︎ | ✅︎ | +| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ | | `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 96208f8eda628..2b60faae8ec0b 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -222,21 +222,6 @@ VLM_TEST_SETTINGS = { }, marks=[large_gpu_mark(min_gb=32)], ), - # Check "auto" with fallback to transformers - "internvl-transformers": VLMTestInfo( - models=["OpenGVLab/InternVL3-1B-hf"], - test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 - img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>", - max_model_len=4096, - use_tokenizer_eos=True, - image_size_factors=[(0.25, 0.5, 1.0)], - vllm_runner_kwargs={ - "model_impl": "auto", - }, - auto_cls=AutoModelForImageTextToText, - marks=[pytest.mark.core_model], - ), #### Extended model tests "aria": VLMTestInfo( models=["rhymes-ai/Aria"], @@ -461,6 +446,20 @@ VLM_TEST_SETTINGS = { use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, ), + "intern_vl-hf": VLMTestInfo( + models=["OpenGVLab/InternVL3-1B-hf"], + test_type=( + VLMTestType.IMAGE, + VLMTestType.MULTI_IMAGE, + 
VLMTestType.VIDEO, + ), + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>", + video_idx_to_prompt=lambda idx: "<video>", + max_model_len=8192, + use_tokenizer_eos=True, + auto_cls=AutoModelForImageTextToText, + ), "kimi_vl": VLMTestInfo( models=["moonshotai/Kimi-VL-A3B-Instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), diff --git a/tests/models/registry.py b/tests/models/registry.py index ee546e7af85c6..2538e71692c4e 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -429,6 +429,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { "3.5-qwen3moe": "OpenGVLab/InternVL3_5-30B-A3B", # noqa: E501 "3.5-gptoss": "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview"}, # noqa: E501 trust_remote_code=True), + "InternVLForConditionalGeneration": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"), # noqa: E501 "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 trust_remote_code=True), "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501 @@ -584,7 +585,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { _TRANSFORMERS_BACKEND_MODELS = { "TransformersModel": _HfExamplesInfo("Qwen/Qwen3-Embedding-0.6B"), "TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501 - "TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"), + "TransformersForMultimodalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), } _EXAMPLE_MODELS = { diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 80eac78cdfadb..02ef301a52a43 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -220,6 +220,7 @@ _MULTIMODAL_MODELS = { "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), "InternS1ForConditionalGeneration": 
("interns1", "InternS1ForConditionalGeneration"), # noqa: E501 + "InternVLForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"), # noqa: E501 "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501 "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"), From 83f555f637b41a0f533fa1d37b194df6f564ac64 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Wed, 27 Aug 2025 16:59:34 +0200 Subject: [PATCH 094/112] [Doc]: upgrade version of crate-ci tool for improved typo detection (#23755) Signed-off-by: Didier Durand <durand.didier@gmail.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 612b290e88d46..c16bdeeecd07a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,7 @@ repos: - id: ruff-format files: ^(.buildkite|benchmarks|examples)/.* - repo: https://github.com/crate-ci/typos - rev: v1.34.0 + rev: v1.35.5 hooks: - id: typos - repo: https://github.com/PyCQA/isort From 3ce8285d6d96b929fddbb8d29be9ed3b81adcd75 Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Wed, 27 Aug 2025 08:11:33 -0700 Subject: [PATCH 095/112] [LogitsProcs] Deduplicate built-in LP implementation logic (#23362) Signed-off-by: Nick Hill <nhill@redhat.com> --- .../offline_inference/logits_processor.py | 38 ++--- tests/v1/logits_processors/utils.py | 37 ++--- vllm/v1/sample/logits_processor/builtin.py | 148 ++++++++---------- vllm/v1/sample/logits_processor/interface.py | 15 +- 4 files changed, 95 insertions(+), 143 deletions(-) diff --git a/examples/offline_inference/logits_processor.py b/examples/offline_inference/logits_processor.py index 7ef20efa7d28c..3e122319169eb 100644 --- a/examples/offline_inference/logits_processor.py +++ 
b/examples/offline_inference/logits_processor.py @@ -42,8 +42,8 @@ from vllm.config import VllmConfig from vllm.v1.sample.logits_processor import ( BatchUpdate, LogitsProcessor, - MoveDirectionality, ) +from vllm.v1.sample.logits_processor.builtin import process_dict_updates # Hypothetical custom logits processor @@ -53,38 +53,22 @@ class DummyLogitsProcessor(LogitsProcessor): def __init__( self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool ): - self.req_info: dict[int, SamplingParams] = {} + self.req_info: dict[int, int] = {} def is_argmax_invariant(self) -> bool: """Never impacts greedy sampling""" return False def update_state(self, batch_update: Optional[BatchUpdate]): - if not batch_update: - return - - # Process added requests. - for index, params, _, _ in batch_update.added: - assert params is not None - if params.extra_args and ( - target_token := params.extra_args.get("target_token") - ): - self.req_info[index] = target_token - - if self.req_info: - # Process removed requests. - for index in batch_update.removed: - self.req_info.pop(index, None) - - # Process moved requests, unidirectional move (a->b) and swap - # (a<->b) - for adx, bdx, direct in batch_update.moved: - a_val = self.req_info.pop(adx, None) - b_val = self.req_info.pop(bdx, None) - if a_val is not None: - self.req_info[bdx] = a_val - if direct == MoveDirectionality.SWAP and b_val is not None: - self.req_info[adx] = b_val + process_dict_updates( + self.req_info, + batch_update, + # This function returns the LP's per-request state based on the + # request details, or None if this LP does not apply to the + # request. 
+ lambda params, _, __: params.extra_args + and (params.extra_args.get("target_token")), + ) def apply(self, logits: torch.Tensor) -> torch.Tensor: if not self.req_info: diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py index c0bfc1a18feca..c36f1bd021c70 100644 --- a/tests/v1/logits_processors/utils.py +++ b/tests/v1/logits_processors/utils.py @@ -8,10 +8,9 @@ from typing import Optional import torch from vllm.config import VllmConfig -from vllm.sampling_params import SamplingParams from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP, BatchUpdate, - LogitsProcessor, - MoveDirectionality) + LogitsProcessor) +from vllm.v1.sample.logits_processor.builtin import process_dict_updates MODEL_NAME = "facebook/opt-125m" POOLING_MODEL_NAME = "BAAI/bge-base-en-v1.5" @@ -45,37 +44,19 @@ class DummyLogitsProcessor(LogitsProcessor): def __init__(self, vllm_config: "VllmConfig", device: torch.device, is_pin_memory: bool): - self.req_info: dict[int, SamplingParams] = {} + self.req_info: dict[int, int] = {} def is_argmax_invariant(self) -> bool: """Never impacts greedy sampling""" return False def update_state(self, batch_update: Optional[BatchUpdate]): - if not batch_update: - return - - # Process added requests. - for index, params, _, _ in batch_update.added: - assert params is not None - if params.extra_args and (target_token := - params.extra_args.get("target_token")): - self.req_info[index] = target_token - - if self.req_info: - # Process removed requests. 
- for index in batch_update.removed: - self.req_info.pop(index, None) - - # Process moved requests, unidirectional move (a->b) and swap - # (a<->b) - for adx, bdx, direct in batch_update.moved: - a_val = self.req_info.pop(adx, None) - b_val = self.req_info.pop(bdx, None) - if a_val is not None: - self.req_info[bdx] = a_val - if direct == MoveDirectionality.SWAP and b_val is not None: - self.req_info[adx] = b_val + process_dict_updates( + self.req_info, + batch_update, + lambda params, _, __: params.extra_args and + (params.extra_args.get("target_token")), + ) def apply(self, logits: torch.Tensor) -> torch.Tensor: if not self.req_info: diff --git a/vllm/v1/sample/logits_processor/builtin.py b/vllm/v1/sample/logits_processor/builtin.py index 00dd757489ca0..60f9c0bdb6313 100644 --- a/vllm/v1/sample/logits_processor/builtin.py +++ b/vllm/v1/sample/logits_processor/builtin.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Callable, Optional, TypeVar import torch +from vllm import SamplingParams from vllm.v1.sample.logits_processor.interface import (BatchUpdate, LogitsProcessor, MoveDirectionality) @@ -12,6 +13,8 @@ from vllm.v1.sample.logits_processor.interface import (BatchUpdate, if TYPE_CHECKING: from vllm.config import VllmConfig +T = TypeVar("T") + class MinPLogitsProcessor(LogitsProcessor): @@ -130,49 +133,15 @@ class LogitBiasLogitsProcessor(LogitsProcessor): return False def update_state(self, batch_update: Optional[BatchUpdate]): - if not batch_update: - return - - needs_update: bool = False - # Process added requests. 
- for index, params, _, _ in batch_update.added: - if lb := params.logit_bias: - self.biases[index] = lb - needs_update = True - else: - # Drop biases metadata at batch index - if self.biases.pop(index, None) is not None: - # If a new request replaces an old request which - # specified biases, we should update processor tensors - needs_update = True - - if self.biases: - # Process removed requests. - for index in batch_update.removed: - if self.biases.pop(index, None): - needs_update = True - - # Process moved requests, unidirectional (a->b) and swap (a<->b) - for a_index, b_index, direct in batch_update.moved: - if direct == MoveDirectionality.UNIDIRECTIONAL: - if (a_entry := self.biases.pop(a_index, None)) is None: - if self.biases.pop(b_index, None) is not None: - needs_update = True - else: - self.biases[b_index] = a_entry - needs_update = True - else: - a_entry = self.biases.pop(a_index, None) - if (b_entry := self.biases.pop(b_index, None)) is not None: - self.biases[a_index] = b_entry - needs_update = True - if a_entry is not None: - self.biases[b_index] = a_entry - needs_update = True + needs_update = process_dict_updates( + self.biases, batch_update, + lambda params, _, __: params.logit_bias or None) # Update tensors if needed. 
if needs_update: - reqs, tok_ids, biases = [], [], [] + reqs: list[int] = [] + tok_ids: list[int] = [] + biases: list[float] = [] for req, lb in self.biases.items(): reqs.extend([req] * len(lb)) tok_ids.extend(lb.keys()) @@ -216,52 +185,18 @@ class MinTokensLogitsProcessor(LogitsProcessor): of the argmax operation in greedy sampling.""" return False + @staticmethod + def add_request( + params: SamplingParams, _: list[int], output_tok_ids: list[int] + ) -> Optional[tuple[int, Sequence[int], set[int]]]: + min_tokens = params.min_tokens + if not min_tokens or len(output_tok_ids) >= min_tokens: + return None + return min_tokens, output_tok_ids, params.all_stop_token_ids + def update_state(self, batch_update: Optional[BatchUpdate]): - needs_update = False - - if batch_update: - # Process added requests. - for index, params, _, output_tok_ids in batch_update.added: - if ((min_tokens := params.min_tokens) - and len(output_tok_ids) < min_tokens): - # Replace request metadata at batch index - self.min_toks[index] = (min_tokens, output_tok_ids, - params.all_stop_token_ids) - needs_update = True - else: - # Drop min_toks metadata at batch index - if self.min_toks.pop(index, None) is not None: - # If a new request replaces an old request which - # specified min_toks, we should update processor tensors - needs_update = True - - if self.min_toks: - # Process removed requests. 
- for index in batch_update.removed: - if self.min_toks.pop(index, None): - needs_update = True - - # Process moved requests, unidirectional (a->b) and - # swapped (a<->b) - for a_index, b_index, direct in batch_update.moved: - if direct == MoveDirectionality.UNIDIRECTIONAL: - if (a_entry := self.min_toks.pop(a_index, - None)) is None: - if self.min_toks.pop(b_index, None) is not None: - needs_update = True - else: - self.min_toks[b_index] = a_entry - needs_update = True - else: - a_entry = self.min_toks.pop(a_index, None) - if (b_entry := self.min_toks.pop(b_index, - None)) is not None: - self.min_toks[a_index] = b_entry - needs_update = True - if a_entry is not None: - self.min_toks[b_index] = a_entry - needs_update = True - + needs_update = process_dict_updates(self.min_toks, batch_update, + self.add_request) if self.min_toks: # Check for any requests that have attained their min tokens. to_remove = tuple(index for index, (min_toks, out_tok_ids, @@ -295,3 +230,44 @@ class MinTokensLogitsProcessor(LogitsProcessor): # Inhibit EOS token for requests which have not reached min length logits[self.logits_slice] = -float("inf") return logits + + +def process_dict_updates( + req_entries: dict[int, T], batch_update: Optional[BatchUpdate], + new_state: Callable[[SamplingParams, list[int], list[int]], Optional[T]] +) -> bool: + """Utility function to update dict state for sparse LogitsProcessors.""" + + if not batch_update: + # Nothing to do. + return False + + updated = False + for index, params, prompt_tok_ids, output_tok_ids in batch_update.added: + if (state := new_state(params, prompt_tok_ids, + output_tok_ids)) is not None: + req_entries[index] = state + updated = True + elif req_entries.pop(index, None) is not None: + updated = True + + if req_entries: + # Process removed requests. 
+ for index in batch_update.removed: + if req_entries.pop(index, None): + updated = True + + # Process moved requests, unidirectional (a->b) and + # swapped (a<->b) + for a_index, b_index, direct in batch_update.moved: + a_entry = req_entries.pop(a_index, None) + b_entry = req_entries.pop(b_index, None) + if a_entry is not None: + req_entries[b_index] = a_entry + updated = True + if b_entry is not None: + updated = True + if direct == MoveDirectionality.SWAP: + req_entries[a_index] = b_entry + + return updated diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py index 12b4db24bff88..16cd00943db8d 100644 --- a/vllm/v1/sample/logits_processor/interface.py +++ b/vllm/v1/sample/logits_processor/interface.py @@ -44,10 +44,16 @@ class BatchUpdate: # Key assumption: the `output_tok_ids` list (which is an element of each # tuple in `added`) is a reference to the request's running output tokens # list; via this reference, the logits processors always see the latest - # list of generated output tokens + # list of generated output tokens. + # + # NOTE: + # * Added or moved requests may replace existing requests with the same + # index. + # * Operations should be processed in the following order: + # - removed, added, moved removed: Sequence[RemovedRequest] - moved: Sequence[MovedRequest] added: Sequence[AddedRequest] + moved: Sequence[MovedRequest] class LogitsProcessor(ABC): @@ -59,6 +65,11 @@ class LogitsProcessor(ABC): @abstractmethod def apply(self, logits: torch.Tensor) -> torch.Tensor: + """Apply LogitsProcessor to batch logits tensor. + + The updated tensor must be returned but may be + modified in-place. 
+ """ raise NotImplementedError @abstractmethod From 2b61d2e22fbcfd6c9df9cdf06f5905b311c2ca18 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 27 Aug 2025 17:22:21 +0100 Subject: [PATCH 096/112] [Docs] Remove in-tree Gaudi install instructions (#23628) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/getting_started/installation/README.md | 1 - .../installation/intel_gaudi.md | 388 ------------------ 2 files changed, 389 deletions(-) delete mode 100644 docs/getting_started/installation/intel_gaudi.md diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index 0ee680f5c688c..8a658b7a9103f 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -12,7 +12,6 @@ vLLM supports the following hardware platforms: - [Apple silicon](cpu.md#apple-silicon) - [IBM Z (S390X)](cpu.md#ibm-z-s390x) - [Google TPU](google_tpu.md) -- [Intel Gaudi](intel_gaudi.md) - [AWS Neuron](aws_neuron.md) ## Hardware Plugins diff --git a/docs/getting_started/installation/intel_gaudi.md b/docs/getting_started/installation/intel_gaudi.md deleted file mode 100644 index ff912efec9ca8..0000000000000 --- a/docs/getting_started/installation/intel_gaudi.md +++ /dev/null @@ -1,388 +0,0 @@ -# Intel Gaudi - -This page provides instructions on running vLLM with Intel Gaudi devices. - -!!! warning - There are no pre-built wheels or images for this device, so you must build vLLM from source. - -## Requirements - -- OS: Ubuntu 22.04 LTS -- Python: 3.10 -- Intel Gaudi accelerator -- Intel Gaudi software version 1.18.0 - -Please follow the instructions provided in the -[Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) -to set up the execution environment. 
To achieve the best performance, -please follow the methods outlined in the -[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). - -## Configure a new environment - -### Environment verification - -To verify that the Intel Gaudi software was correctly installed, run: - -```bash -hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible -apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed -pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed -pip list | grep neural # verify that neural_compressor_pt is installed -``` - -Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) -for more details. - -### Run Docker Image - -It is highly recommended to use the latest Docker image from Intel Gaudi -vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) -for more details. - -Use the following commands to run a Docker image: - -```bash -docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest -docker run \ - -it \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest -``` - -## Set up using Python - -### Pre-built wheels - -Currently, there are no pre-built Intel Gaudi wheels. 
- -### Build wheel from source - -To build and install vLLM from source, run: - -```bash -git clone https://github.com/vllm-project/vllm.git -cd vllm -pip install -r requirements/hpu.txt -python setup.py develop -``` - -Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: - -```bash -git clone https://github.com/HabanaAI/vllm-fork.git -cd vllm-fork -git checkout habana_main -pip install -r requirements/hpu.txt -python setup.py develop -``` - -## Set up using Docker - -### Pre-built images - -Currently, there are no pre-built Intel Gaudi images. - -### Build image from source - -```bash -docker build -f docker/Dockerfile.hpu -t vllm-hpu-env . -docker run \ - -it \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --rm vllm-hpu-env -``` - -!!! tip - If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. 
- -## Extra information - -### Supported features - -- [Offline inference](../../serving/offline_inference.md) -- Online serving via [OpenAI-Compatible Server](../../serving/openai_compatible_server.md) -- HPU autodetection - no need to manually select device within vLLM -- Paged KV cache with algorithms enabled for Intel Gaudi accelerators -- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, - prefill attention, Root Mean Square Layer Normalization, Rotary - Positional Encoding -- Tensor parallelism support for multi-card inference -- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) - for accelerating low-batch latency and throughput -- Attention with Linear Biases (ALiBi) -- INC quantization - -### Unsupported features - -- Beam search -- LoRA adapters -- AWQ quantization -- Prefill chunking (mixed-batch inferencing) - -### Supported configurations - -The following configurations have been validated to function with -Gaudi2 devices. Configurations that are not listed may or may not work. 
- -| Model | TP Size| dtype | Sampling | -|-------|--------|--------|----------| -| [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) | 1, 2, 8 | BF16 | Random / Greedy | -| [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1, 2, 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 1, 2, 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | 1, 2, 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) | 1, 2, 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1, 2, 8 | BF16 | Random / Greedy | -| [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) | 8 | BF16 | Random / Greedy | -| [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) | 8 | BF16 | Random / Greedy | -| [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 8 | BF16 | Random / Greedy | - -## Performance tuning - -### Execution modes - -Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag. 
- -| `PT_HPU_LAZY_MODE` | `enforce_eager` | execution mode | -|----------------------|-------------------|--------------------| -| 0 | 0 | torch.compile | -| 0 | 1 | PyTorch eager mode | -| 1 | 0 | HPU Graphs | - -!!! warning - In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. - -[](){ #gaudi-bucketing-mechanism } - -### Bucketing mechanism - -Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. -In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`. - -!!! note - Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. - -Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. 
These parameters can be observed in logs during vLLM startup: - -```text -INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] -INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] -INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] -INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] -``` - -| Parameter | Description | -|----------------|-----------------------------------------------------------------------------| -| `min` | Determines the lowest value of the bucket. | -| `step` | Determines the interval between buckets. | -| `max` | Determines the upper bound of the bucket. | -| Ramp-up phase | A special handling phase applied between `min` and `step`:<br/>- `min` is multiplied by consecutive powers of two until `step` is reached.<br/>- Minimizes resource wastage for small batch sizes.<br/>- Allows larger padding for larger batches. 
| - -Example (with ramp-up): - -```text -min = 2, step = 32, max = 64 -=> ramp_up = (2, 4, 8, 16) -=> stable = (32, 64) -=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) -``` - -Example (without ramp-up): - -```text -min = 128, step = 128, max = 512 -=> ramp_up = () -=> stable = (128, 256, 384, 512) -=> buckets = ramp_up + stable => (128, 256, 384, 512) -``` - -In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. - -!!! warning - If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. - -As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket. - -!!! note - Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. 
- -### Warmup - -Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: - -??? console "Logs" - - ```text - INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB - INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB - INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB - ... - INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB - INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB - INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB - ... - INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB - INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB - ``` - -This example uses the same buckets as in the [Bucketing Mechanism][gaudi-bucketing-mechanism] section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. - -!!! tip - Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. 
It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. - -### HPU Graph capture - -[HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. - -When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by `gpu_memory_utilization` flag (`0.9` by default). -Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. -Only after that, `gpu_memory_utilization` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable. -Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. -Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of memory reserved for HPU Graphs capture. -With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. -Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints. -Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. 
`VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. - -!!! note - `gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. - -User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: - -- `max_bs` - graph capture queue will be sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode -- `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt - -When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy. - -!!! 
note - `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt to do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. - -Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): - -??? console "Logs" - - ```text - INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] - INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] - INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] - INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - INFO 
08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) - INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache - INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 - INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) - INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB - ... - INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) - INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB - ... 
- INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB - INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB - ... - INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB - INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB - INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB - INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB - INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB - INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] - INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory - INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory 
(475.4 GiB/1007 GiB used) - ``` - -### Recommended vLLM Parameters - -- We recommend running inference on Gaudi 2 with `block_size` of 128 - for BF16 data type. Using default values (16, 32) might lead to - sub-optimal performance due to Matrix Multiplication Engine - under-utilization (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). -- For max throughput on Llama 7B, we recommend running with batch size - of 128 or 256 and max context length of 2048 with HPU Graphs enabled. - If you encounter out-of-memory issues, see troubleshooting section. - -### Environment variables - -**Diagnostic and profiling knobs:** - -- `VLLM_PROFILER_ENABLED`: If `true`, enable the high level profiler. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). `false` by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: If `true`, log graph compilations for each vLLM engine step when any occurs. Highly recommended to use with `PT_HPU_METRICS_GC_DETAILS=1`. `false` by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: If `true`, always log graph compilations for each vLLM engine step even if none occurred. `false` by default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: If `true`, log CPU fallbacks for each vLLM engine step when any occurs. `false` by default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, always log CPU fallbacks for each vLLM engine step even if none occurred. `false` by default. 
- -**Performance tuning knobs:** - -- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by default - -- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default - -- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.3` by default - -- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default - -- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default - -- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism - - - `{phase}` is either `PROMPT` or `DECODE` - - - `{dim}` is either `BS`, `SEQ` or `BLOCK` - - - `{param}` is either `MIN`, `STEP` or `MAX` - - - Default values: - -| `{phase}` | Parameter | Env Variable | Value Expression | -|-----------|-----------|--------------|------------------| -| Prompt | Batch size min | `VLLM_PROMPT_BS_BUCKET_MIN` | `1` | -| Prompt | Batch size step | `VLLM_PROMPT_BS_BUCKET_STEP` | `min(max_num_seqs, 32)` | -| Prompt | Batch size max | `VLLM_PROMPT_BS_BUCKET_MAX` | `min(max_num_seqs, 64)` | -| Prompt | Sequence length min | `VLLM_PROMPT_SEQ_BUCKET_MIN` | `block_size` | -| Prompt | Sequence length step | `VLLM_PROMPT_SEQ_BUCKET_STEP` | `block_size` | -| Prompt | Sequence length max | `VLLM_PROMPT_SEQ_BUCKET_MAX` | `max_model_len` | -| Decode | Batch size min | `VLLM_DECODE_BS_BUCKET_MIN` | `1` | -| Decode | Batch size step | `VLLM_DECODE_BS_BUCKET_STEP` | `min(max_num_seqs, 32)` | -| Decode | Batch size max | `VLLM_DECODE_BS_BUCKET_MAX` | `max_num_seqs` | -| Decode | Sequence length min | `VLLM_DECODE_BLOCK_BUCKET_MIN` | `block_size` | -| Decode | Sequence length step | `VLLM_DECODE_BLOCK_BUCKET_STEP` | `block_size` | -| Decode | Sequence length max | `VLLM_DECODE_BLOCK_BUCKET_MAX` | `max(128, 
(max_num_seqs*max_model_len)/block_size)` | - -Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: - -- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used; if `1`, PyTorch Lazy backend for Gaudi will be used. `1` is default. -- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs - -## Troubleshooting: tweaking HPU graphs - -If you experience device out-of-memory issues or want to attempt -inference at higher batch sizes, try tweaking HPU Graphs by following -the below: - -- Tweak `gpu_memory_utilization` knob. It will decrease the - allocation of KV cache, leaving some headroom for capturing graphs - with larger batch size. By default `gpu_memory_utilization` is set - to 0.9. It attempts to allocate ~90% of HBM left for KV cache after - short profiling run. Note that decreasing reduces the number of KV - cache blocks you have available, and therefore reduces the effective - maximum number of tokens you can handle at a given time. -- If this method is not efficient, you can disable `HPUGraph` - completely. With HPU Graphs disabled, you are trading latency and - throughput at lower batches for potentially higher throughput on - higher batches. You can do that by adding `--enforce-eager` flag to - server (for online serving), or by passing `enforce_eager=True` - argument to LLM constructor (for offline inference). 
From 4f35be10a96feeca0328d3ab8d359e1eaae5c23d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= <ProExpertProg@users.noreply.github.com> Date: Wed, 27 Aug 2025 12:47:28 -0400 Subject: [PATCH 097/112] [BugFix] Fix topk_softmax assert (#19764) Signed-off-by: Luka Govedic <lgovedic@redhat.com> --- csrc/moe/topk_softmax_kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 99c52ef17d08b..cd80bfda7dfde 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -573,7 +573,7 @@ void topk_softmax( stream); } else { - assert(topk_indices.scalar_type() == at::ScalarType::Int64); + TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long); vllm::moe::topkGatingSoftmaxKernelLauncher( gating_output.data_ptr<float>(), topk_weights.data_ptr<float>(), From 52883ed08461943ff55d5dd3cf12a28c00902fa7 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Thu, 28 Aug 2025 01:01:50 +0800 Subject: [PATCH 098/112] [Model] Merge `SupportsMultiModalWithRawInput` with `SupportsMultiModal` (#23749) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/config/__init__.py | 8 ++-- vllm/model_executor/models/interfaces.py | 45 +++++-------------- .../models/prithvi_geospatial_mae.py | 6 +-- vllm/model_executor/models/registry.py | 11 ++--- vllm/v1/worker/gpu_model_runner.py | 10 +++-- 5 files changed, 30 insertions(+), 50 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index e3fb6d796def5..351833d3f02d0 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1698,6 +1698,10 @@ class ModelConfig: def is_multimodal_model(self) -> bool: return self.multimodal_config is not None + @property + def is_multimodal_raw_input_only_model(self) -> bool: + return self._model_info.supports_multimodal_raw_input_only + @property def is_cross_encoder(self) -> bool: return 
(self._model_info.supports_cross_encoding @@ -1707,10 +1711,6 @@ class ModelConfig: def is_pp_supported(self) -> bool: return self._model_info.supports_pp - @property - def is_multimodal_raw_input_supported(self) -> bool: - return self._model_info.supports_multimodal_raw_input - @property def is_attention_free(self) -> bool: return self._model_info.is_attention_free diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 506732fed3614..2ee966fb5c0c8 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -52,6 +52,12 @@ class SupportsMultiModal(Protocol): MRO of your model class. """ + supports_multimodal_raw_input_only: ClassVar[bool] = False + """ + A flag that indicates this model supports multi-modal inputs and processes + them in their raw form and not embeddings. + """ + supports_encoder_tp_data: ClassVar[bool] = False """ A flag that indicates whether this model supports @@ -143,45 +149,16 @@ def supports_multimodal( return getattr(model, "supports_multimodal", False) +def supports_multimodal_raw_input_only( + model: Union[type[object], object]) -> bool: + return getattr(model, "supports_multimodal_raw_input_only", False) + + def supports_multimodal_encoder_tp_data( model: Union[type[object], object]) -> bool: return getattr(model, "supports_encoder_tp_data", False) -@runtime_checkable -class SupportsMultiModalWithRawInput(SupportsMultiModal, Protocol): - """The interface required for all multi-modal models.""" - - supports_multimodal_raw_input: ClassVar[Literal[True]] = True - """ - A flag that indicates this model supports multi-modal inputs and processes - them in their raw form and not embeddings. - - Note: - There is no need to redefine this flag if this class is in the - MRO of your model class. - """ - - -@overload -def supports_multimodal_raw_input( - model: object) -> TypeIs[SupportsMultiModalWithRawInput]: - ... 
- - -@overload -def supports_multimodal_raw_input( - model: type[object]) -> TypeIs[type[SupportsMultiModalWithRawInput]]: - ... - - -def supports_multimodal_raw_input( - model: Union[type[object], object] -) -> Union[TypeIs[type[SupportsMultiModalWithRawInput]], - TypeIs[SupportsMultiModalWithRawInput]]: - return getattr(model, "supports_multimodal_raw_input", False) - - @runtime_checkable class SupportsScoreTemplate(Protocol): """The interface required for all models that support score template.""" diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index f46d6375e1f61..2d14fe6d5892f 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -41,7 +41,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from .interfaces import (IsAttentionFree, MultiModalEmbeddings, - SupportsMultiModalWithRawInput) + SupportsMultiModal) from .interfaces_base import default_pooling_type @@ -174,10 +174,10 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): info=PrithviGeoSpatialMAEProcessingInfo, dummy_inputs=PrithviGeoSpatialMAEInputBuilder, ) -class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, - SupportsMultiModalWithRawInput): +class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal): """Prithvi Masked Autoencoder""" + supports_multimodal_raw_input_only = True is_pooling_model = True @classmethod diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 02ef301a52a43..12c0c77784db8 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -29,7 +29,7 @@ from .interfaces import (has_inner_state, has_noops, is_attention_free, is_hybrid, supports_cross_encoding, supports_multimodal, supports_multimodal_encoder_tp_data, - supports_multimodal_raw_input, 
supports_pp, + supports_multimodal_raw_input_only, supports_pp, supports_transcription, supports_v0_only) from .interfaces_base import (get_default_pooling_type, is_pooling_model, is_text_generation_model) @@ -326,7 +326,7 @@ class _ModelInfo: default_pooling_type: str supports_cross_encoding: bool supports_multimodal: bool - supports_multimodal_raw_input: bool + supports_multimodal_raw_input_only: bool supports_multimodal_encoder_tp_data: bool supports_pp: bool has_inner_state: bool @@ -346,7 +346,8 @@ class _ModelInfo: default_pooling_type=get_default_pooling_type(model), supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), - supports_multimodal_raw_input=supports_multimodal_raw_input(model), + supports_multimodal_raw_input_only= + supports_multimodal_raw_input_only(model), supports_multimodal_encoder_tp_data= supports_multimodal_encoder_tp_data(model), supports_pp=supports_pp(model), @@ -743,13 +744,13 @@ class _ModelRegistry: model_cls, _ = self.inspect_model_cls(architectures, model_config) return model_cls.supports_multimodal - def supports_multimodal_raw_input( + def is_multimodal_raw_input_only_model( self, architectures: Union[str, list[str]], model_config: ModelConfig, ) -> bool: model_cls, _ = self.inspect_model_cls(architectures, model_config) - return model_cls.supports_multimodal_raw_input + return model_cls.supports_multimodal_raw_input_only def is_pp_supported_model( self, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d93460d618e7c..20d2d20ba0967 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -139,8 +139,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cache_config.cache_dtype] self.is_pooling_model = model_config.pooler_config is not None - self.is_multimodal_raw_input_supported = ( - model_config.is_multimodal_raw_input_supported) + self.is_multimodal_raw_input_only_model = ( + 
model_config.is_multimodal_raw_input_only_model) + self.max_model_len = model_config.max_model_len self.max_num_tokens = scheduler_config.max_num_batched_tokens self.max_num_reqs = scheduler_config.max_num_seqs @@ -612,7 +613,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self, scheduler_output: "SchedulerOutput", ) -> BatchedTensorInputs: - if not self.is_multimodal_raw_input_supported or not scheduler_output: # noqa: SIM102 + if not scheduler_output or not self.is_multimodal_raw_input_only_model: return {} mm_kwargs = list[MultiModalKwargsItem]() @@ -631,8 +632,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return mm_kwargs_combined def _dummy_mm_kwargs(self, num_seqs: int) -> BatchedTensorInputs: - if not self.is_multimodal_raw_input_supported: + if not self.is_multimodal_raw_input_only_model: return {} + mm_budget = self.mm_budget assert mm_budget is not None From dd589322801e2eb8426aa2b95f2729699ff431c5 Mon Sep 17 00:00:00 2001 From: Thomas Parnell <tpa@zurich.ibm.com> Date: Wed, 27 Aug 2025 19:05:16 +0200 Subject: [PATCH 099/112] [V1] [Hybrid] Enable compile and piecewise CUDA graph for MiniMax-Text models (#22589) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> --- vllm/config/compilation.py | 1 + vllm/model_executor/models/minimax_text_01.py | 234 ++++++++---------- 2 files changed, 98 insertions(+), 137 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 56aa00a30d3ae..5c3b220016360 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -339,6 +339,7 @@ class CompilationConfig: "vllm.mamba_mixer2", "vllm.mamba_mixer", "vllm.short_conv", + "vllm.linear_attention", ] def compute_hash(self) -> str: diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 0e854bd7d913d..176a40179bcac 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ 
b/vllm/model_executor/models/minimax_text_01.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only MiniMaxText01 model.""" -import copy import math from collections.abc import Iterable from typing import TYPE_CHECKING, Optional, Union @@ -19,13 +18,14 @@ from transformers import MiniMaxConfig from vllm import envs from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import (CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config) from vllm.distributed.communication_op import tensor_model_parallel_all_reduce from vllm.distributed.parallel_state import ( get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.forward_context import get_forward_context +from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE @@ -43,12 +43,15 @@ from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import maybe_prefix from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors +from vllm.utils import direct_register_custom_op from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata from .interfaces import HasInnerState, 
IsHybrid @@ -143,61 +146,6 @@ class MiniMaxText01RMSNormTP(CustomOp): return self._forward(x) -class MiniMaxText01RotaryEmbedding(CustomOp): - name = "MiniMaxText01RotaryEmbedding" - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position: int, - base: float, - is_neox_style: bool, - cache_dtype: torch.dtype, - ) -> None: - super().__init__() - self.head_size = head_size - self.rotary_dim = rotary_dim - self.max_position_embeddings = max_position - self.base = base - self.is_neox_style = is_neox_style - self.cache_dtype = cache_dtype - cache = self._compute_cos_sin_cache().to(cache_dtype) - self.register_buffer("cos_sin_cache", cache, persistent=False) - - def _compute_inv_freq(self, base: float) -> torch.Tensor: - """Compute the inverse frequency.""" - inv_freq = 1.0 / (base**(torch.arange( - 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim)) - return inv_freq - - def _compute_cos_sin_cache(self) -> torch.Tensor: - """Compute the cos and sin cache.""" - inv_freq = self._compute_inv_freq(self.base) - t = torch.arange(self.max_position_embeddings, dtype=torch.float) - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() - sin = freqs.sin() - cache = torch.cat((cos, sin), dim=-1) - return cache - - def forward( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - from vllm import _custom_ops as ops - self.cos_sin_cache = self.cos_sin_cache.to(positions.device) - query_cast = query.to(self.cache_dtype) - key_cast = key.to(self.cache_dtype) - ops.rotary_embedding(positions, query_cast, key_cast, self.head_size, - self.cos_sin_cache, self.is_neox_style) - query = query_cast.to(query.dtype) - key = key_cast.to(key.dtype) - return query, key - - class MiniMaxText01MLP(nn.Module): def __init__( @@ -526,20 +474,40 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase): slot_id, 32) return hidden - def forward(self, hidden_states: torch.Tensor, positions: 
torch.Tensor, - kv_caches: MinimaxCacheParams, **kwargs) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) + def forward(self, hidden_states: torch.Tensor, output: torch.Tensor, + positions: torch.Tensor, + kv_caches: MinimaxCacheParams) -> None: + if not envs.VLLM_USE_V1: + self._forward(hidden_states, output, positions, kv_caches) + else: + torch.ops.vllm.linear_attention( + hidden_states, + output, + positions, + self.prefix, + ) + + def _forward(self, hidden_states: torch.Tensor, output: torch.Tensor, + positions: torch.Tensor, + kv_caches: Optional[MinimaxCacheParams]) -> None: + forward_context = get_forward_context() + attn_metadata: AttentionMetadata = forward_context.attn_metadata + if envs.VLLM_USE_V1 and attn_metadata is not None: + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + assert isinstance(attn_metadata, LinearAttentionMetadata) + num_actual_tokens = attn_metadata.num_prefill_tokens + \ + attn_metadata.num_decode_tokens + else: + num_actual_tokens = hidden_states.shape[0] + + qkv, _ = self.qkv_proj(hidden_states[:num_actual_tokens]) qkv32 = qkv.to(torch.float32) qkvact = torch.nn.functional.silu(qkv32) qkvact = qkvact.view((qkv.shape[0], self.tp_heads, -1)) q, k, v = torch.split(qkvact, [self.head_dim] * 3, dim=-1) - forward_context = get_forward_context() - attn_metadata = forward_context.attn_metadata if envs.VLLM_USE_V1: if attn_metadata is not None: - assert isinstance(attn_metadata, dict) - attn_metadata = attn_metadata[self.prefix] - assert isinstance(attn_metadata, LinearAttentionMetadata) kv_cache = self.kv_cache[forward_context.virtual_engine][0] state_indices_tensor = attn_metadata.state_indices_tensor @@ -578,13 +546,11 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase): hidden = self._decode_infer(q, k, v, kv_cache, state_indices_tensor, attn_metadata) - hidden = self.norm._forward(hidden) - gate, _ = self.output_gate(hidden_states) + gate, _ = 
self.output_gate(hidden_states[:num_actual_tokens]) hidden = F.sigmoid(gate) * hidden hidden = hidden.to(hidden_states.dtype) - hidden, _ = self.out_proj(hidden) - return hidden + output[:num_actual_tokens], _ = self.out_proj(hidden) class MiniMaxText01Attention(nn.Module): @@ -652,23 +618,23 @@ class MiniMaxText01Attention(nn.Module): quant_config=quant_config, prefix=f"{prefix}.attn", ) + self.rotary_emb = get_rope( + head_size=self.head_dim, + rotary_dim=rotary_dim, + max_position=max_position, + base=int(rope_theta), + is_neox_style=True, + dtype=torch.float32, + ) return - def forward(self, hidden_states: torch.Tensor, positions: torch.Tensor, - **kwargs) -> torch.Tensor: - forward_context = get_forward_context() - attn_metadata = forward_context.attn_metadata + def forward(self, hidden_states: torch.Tensor, output: torch.Tensor, + positions: torch.Tensor, **kwargs) -> None: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - if envs.VLLM_USE_V1: - if attn_metadata is not None: - q, k = attn_metadata[f"{self.prefix}.attn"].rotary_emb( - positions, q, k) - else: - q, k = attn_metadata.rotary_emb(positions, q, k) + q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) - output, _ = self.o_proj(attn_output) - return output + output[:], _ = self.o_proj(attn_output) class MiniMaxText01DecoderLayer(nn.Module): @@ -816,16 +782,15 @@ class MiniMaxText01DecoderLayer(nn.Module): is_warmup: bool = False, **kwargs) -> tuple[torch.Tensor, torch.Tensor]: - forward_context = get_forward_context() - attn_metadata = forward_context.attn_metadata layernorm_input = hidden_states layernorm_output = self.input_layernorm(layernorm_input) residual = layernorm_output if self.postnorm else layernorm_input - self_attention_output = self.self_attn( + self_attention_output = torch.empty_like(layernorm_output) + self.self_attn( hidden_states=layernorm_output, + output=self_attention_output, positions=positions, 
kv_caches=kv_caches, - attn_metadata=attn_metadata, ) residual = residual * self.layernorm_attention_alpha @@ -839,8 +804,8 @@ class MiniMaxText01DecoderLayer(nn.Module): if self.expert_num == 1: hidden_states = self.mlp(layernorm_output) else: - moe_hidden_states = self.block_sparse_moe( - copy.deepcopy(layernorm_output)) + moe_layernorm_output = layernorm_output.clone() + moe_hidden_states = self.block_sparse_moe(moe_layernorm_output) if self.shared_moe: before_moe_dtype = layernorm_output.dtype moe_hidden_fp32 = moe_hidden_states.to(torch.float32) @@ -878,18 +843,16 @@ class MiniMaxText01DecoderLayer(nn.Module): return +@support_torch_compile class MiniMaxText01Model(nn.Module): - def __init__( - self, - config: MiniMaxConfig, - model_config: Optional[ModelConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - cache_config: Optional[CacheConfig] = None, - scheduler_config=None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config: MiniMaxConfig = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + quant_config = vllm_config.quant_config + cache_config = vllm_config.cache_config + scheduler_config = vllm_config.scheduler_config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -976,24 +939,6 @@ class MiniMaxText01Model(nn.Module): self.minimax_cache = MinimaxCacheManager( dtype=torch.float32, cache_shape=self.cache_shape) - rope_theta = getattr(config, "rope_theta", 10000) - head_dim = getattr(config, "head_dim", None) - if head_dim is None: - head_dim = config.hidden_size // config.num_attention_heads - if hasattr(config, "max_model_len") and isinstance( - config.max_model_len, int): - max_position_embeddings = min(config.max_position_embeddings, - config.max_model_len) - self.rotary_emb = MiniMaxText01RotaryEmbedding( - head_dim, - rotary_dim=config.rotary_dim - if hasattr(config, "rotary_dim") else head_dim, - 
max_position=max_position_embeddings, - base=int(rope_theta), - is_neox_style=True, - cache_dtype=torch.float32, - ) - norm_kwargs = {} if hasattr(config, "rms_norm_eps"): norm_kwargs["eps"] = config.rms_norm_eps @@ -1043,12 +988,11 @@ class MiniMaxText01Model(nn.Module): attn_metadata = forward_context.attn_metadata if not envs.VLLM_USE_V1 and attn_metadata is None: return None - if "request_ids_to_seq_ids" not in kwargs: - kwargs["request_ids_to_seq_ids"] = {} - if "finished_requests_ids" not in kwargs: - kwargs["finished_requests_ids"] = [] - if not envs.VLLM_USE_V1: + if "request_ids_to_seq_ids" not in kwargs: + kwargs["request_ids_to_seq_ids"] = {} + if "finished_requests_ids" not in kwargs: + kwargs["finished_requests_ids"] = [] ( minimax_cache_tensors, state_indices_tensor, @@ -1077,16 +1021,6 @@ class MiniMaxText01Model(nn.Module): for i in range(self.start_layer, self.end_layer): layer = self.layers[i] - if attn_metadata is not None: - # TODO (tdoublep): this whole thing with the rotary_emb is - # weird. we shouldn't be passing it via attn_metadata imo. 
- if envs.VLLM_USE_V1: - if isinstance(layer.self_attn, MiniMaxText01Attention): - attn_metadata[layer.prefix + - ".attn"].rotary_emb = self.rotary_emb - else: - attn_metadata.rotary_emb = self.rotary_emb - _caches = None if not envs.VLLM_USE_V1 and isinstance( layer.self_attn, MiniMaxText01LinearAttention): @@ -1120,7 +1054,6 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid): super().__init__() config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config @@ -1133,13 +1066,8 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid): self.unpadded_vocab_size = self.config.vocab_size if hasattr(vllm_config.model_config, "max_model_len"): self.config.max_model_len = vllm_config.model_config.max_model_len - self.model = MiniMaxText01Model( - self.config, - model_config=vllm_config.model_config, - cache_config=vllm_config.cache_config, - quant_config=quant_config, - scheduler_config=vllm_config.scheduler_config, - prefix=maybe_prefix(prefix, "model")) + self.model = MiniMaxText01Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: self.lm_head = ParallelLMHead( self.unpadded_vocab_size, @@ -1469,3 +1397,35 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid): tp_size=parallel_config.tensor_parallel_size, head_dim=hf_config.head_dim, ) + + +def linear_attention( + hidden_states: torch.Tensor, + output: torch.Tensor, + positions: torch.Tensor, + layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + self._forward(hidden_states=hidden_states, + output=output, + positions=positions, + kv_caches=None) + + +def linear_attention_fake( + hidden_states: torch.Tensor, + output: torch.Tensor, + positions: torch.Tensor, + layer_name: str, +) -> None: + return + + 
+direct_register_custom_op( + op_name="linear_attention", + op_func=linear_attention, + mutates_args=["output"], + fake_impl=linear_attention_fake, + dispatch_key=current_platform.dispatch_key, +) From 4e4d017b6f70c729e7c78f74e4328a4ebca7b8ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= <ohg3417@gmail.com> Date: Thu, 28 Aug 2025 02:17:29 +0900 Subject: [PATCH 100/112] [Docs] Fix warnings in `mkdocs build` (continued) (#23743) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Zerohertz <ohg3417@gmail.com> Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com> --- vllm/core/block/naive_block.py | 2 +- vllm/core/block/prefix_caching_block.py | 2 +- vllm/core/scheduler.py | 2 +- vllm/v1/attention/backends/cpu_attn.py | 3 ++- vllm/v1/attention/backends/flash_attn.py | 3 ++- vllm/v1/attention/backends/flashinfer.py | 8 +++----- vllm/v1/attention/backends/flex_attention.py | 3 ++- vllm/v1/attention/backends/pallas.py | 5 +++-- vllm/v1/attention/backends/rocm_aiter_fa.py | 3 ++- vllm/v1/attention/backends/tree_attn.py | 3 ++- vllm/v1/attention/backends/triton_attn.py | 3 ++- vllm/v1/attention/backends/xformers.py | 3 ++- vllm/v1/core/encoder_cache_manager.py | 8 ++++---- vllm/v1/core/kv_cache_coordinator.py | 3 ++- vllm/v1/core/kv_cache_manager.py | 11 ++++++----- vllm/v1/executor/ray_distributed_executor.py | 3 ++- vllm/v1/metrics/prometheus.py | 2 +- vllm/v1/sample/logits_processor/interface.py | 4 ++-- vllm/v1/sample/rejection_sampler.py | 2 +- vllm/v1/sample/tpu/sampler.py | 2 +- vllm/v1/structured_output/backend_types.py | 4 ++-- vllm/v1/worker/gpu_input_batch.py | 3 --- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 10 +++++----- vllm/v1/worker/utils.py | 8 ++++---- vllm/v1/worker/worker_base.py | 4 ++-- 26 files changed, 56 insertions(+), 50 deletions(-) diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 
dae6ead04e9c9..7d9b32cd4b674 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -207,7 +207,7 @@ class NaiveBlockAllocator(BlockAllocator): Args: absolute_id (int): The absolute block id for the block - in whole allocator. + in whole allocator. Returns: int: The zero-offset block id on certain device. diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 2913a01bf34a5..a21d69323abbc 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -61,7 +61,7 @@ class PrefixCachingBlockAllocator(BlockAllocator): Args: num_blocks (int): The total number of blocks to manage. block_size (int): The size of each block in tokens. - block_ids(Optional[Iterable[int]], optional): An optional iterable of + block_ids (Optional[Iterable[int]], optional): An optional iterable of block IDs. If not provided, block IDs will be assigned sequentially from 0 to num_blocks - 1. """ diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 63894e7f5dc8b..c89f3f6632642 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -657,7 +657,7 @@ class Scheduler: `budget.num_batched_tokens` has not enough capacity to schedule all tokens. partial_prefill_metadata: information about the partial prefills - that are currently running + that are currently running Returns: SchedulerRunningOutputs. 
diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index 973979fdf7dfd..ced8234a7b433 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -491,7 +491,8 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): query: shape = [num_tokens, num_heads * head_size] key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + kv_cache: shape = + [2, num_blocks, block_size * num_kv_heads * head_size] NOTE: kv_cache will be an empty tensor with shape [0] for profiling run. attn_metadata: Metadata for attention. diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 6e7096de924ca..dd2b956d4fa3d 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -438,7 +438,8 @@ class FlashAttentionImpl(AttentionImpl): query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. 
Returns: shape = [num_tokens, num_heads * head_size] diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 1115fc606b055..70d3471a47259 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -637,11 +637,9 @@ class FlashInferImpl(AttentionImpl): query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache: shape - - # NHD: [num_blocks, 2, block_size, num_kv_heads, head_size] - # HND: [num_blocks, 2, num_kv_heads, block_size, head_size] - - + kv_cache: KV cache tensor with different possible shapes: + - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size] + - HND: [num_blocks, 2, num_kv_heads, block_size, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index 458562ebc8d27..a596f6b2b32a4 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -689,7 +689,8 @@ class FlexAttentionImpl(AttentionImpl): query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. 
Returns: shape = [num_tokens, num_heads * head_size] diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py index fd97db0abb84f..26f9abf13d0ed 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -235,7 +235,8 @@ class PallasAttentionBackendImpl(AttentionImpl): query: shape = [num_tokens, num_heads * head_size] key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size] + kv_cache: shape = + [num_blocks, block_size, num_kv_heads * 2, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] @@ -329,7 +330,7 @@ def write_to_kv_cache( Args: key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size] + kv_cache: shape = [num_blocks, block_size, num_kv_heads * 2, head_size] num_slices_per_kv_cache_update_block: int """ _, page_size, num_combined_kv_heads, head_size = kv_cache.shape diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 403ad8e88a958..173a0a255e491 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -429,7 +429,8 @@ class AiterFlashAttentionImpl(AttentionImpl): query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. 
Returns: shape = [num_tokens, num_heads * head_size] diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index c93223a340839..b96d957a150b5 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -362,7 +362,8 @@ class TreeAttentionImpl(AttentionImpl): query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index b12036c599799..a37a7f6811ef9 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -285,7 +285,8 @@ class TritonAttentionImpl(AttentionImpl): query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. 
Returns: shape = [num_tokens, num_heads * head_size] diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py index e0eb7d8be9746..7f888c1135743 100644 --- a/vllm/v1/attention/backends/xformers.py +++ b/vllm/v1/attention/backends/xformers.py @@ -330,7 +330,8 @@ class XFormersAttentionImpl(AttentionImpl): query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index c9d18033a1988..bd2ec036834b2 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -255,9 +255,9 @@ def compute_encoder_budget( Returns: - Compute budget for encoder execution, measured in number of tokens - from the input sequence. + from the input sequence. - Space budget for encoder cache size, measured in number of tokens - from the input sequence. + from the input sequence. """ if mm_registry.supports_multimodal_inputs(model_config): max_tokens_by_modality = mm_registry \ @@ -303,9 +303,9 @@ def compute_mm_encoder_budget( Returns: - Compute budget for encoder execution, measured in number of tokens - from the input sequence. + from the input sequence. - Space budget for encoder cache size, measured in number of tokens - from the input sequence. + from the input sequence. """ if not max_tokens_by_modality: diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index f082ad00f2e35..9421341f990c8 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -119,7 +119,8 @@ class KVCacheCoordinator(ABC): Args: request: The request. 
- num_tokens: The total number of tokens that need to be cached + num_computed_tokens: The total number of tokens + that need to be cached (including tokens that are already cached). """ for manager in self.single_type_managers: diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index b427a9c497fef..87a11fe58a048 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -54,14 +54,15 @@ class KVCacheBlocks: def get_block_ids( self, allow_none: bool = False, - ): + ) -> Optional[tuple[list[int], ...]]: """ Converts the KVCacheBlocks instance to block_ids. - + Returns: - tuple[list[int], ...]: A tuple of lists where - * the outer tuple corresponds to KV cache groups - * each inner list contains the block_ids of the blocks in that group + tuple[list[int], ...]: A tuple of lists where: + - the outer tuple corresponds to KV cache groups + - each inner list contains the block_ids of the blocks in that + group """ if allow_none and all(len(group) == 0 for group in self.blocks): return None diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py index c05ad1966d611..8394ae788ab01 100644 --- a/vllm/v1/executor/ray_distributed_executor.py +++ b/vllm/v1/executor/ray_distributed_executor.py @@ -8,6 +8,7 @@ from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.executor.ray_distributed_executor import ( # noqa RayDistributedExecutor as RayDistributedExecutorV0) from vllm.logger import init_logger +from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.executor.abstract import Executor from vllm.v1.outputs import ModelRunnerOutput @@ -64,7 +65,7 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor): def execute_model( self, - scheduler_output, + scheduler_output: SchedulerOutput, ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]: 
"""Execute the model on the Ray workers. diff --git a/vllm/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py index 61ba5d66cb31a..a43cf9ce255e6 100644 --- a/vllm/v1/metrics/prometheus.py +++ b/vllm/v1/metrics/prometheus.py @@ -36,7 +36,7 @@ def setup_multiprocess_prometheus(): "and vLLM will properly handle cleanup.") -def get_prometheus_registry(): +def get_prometheus_registry() -> CollectorRegistry: """Get the appropriate prometheus registry based on multiprocessing configuration. diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py index 16cd00943db8d..683fc7c00dfb2 100644 --- a/vllm/v1/sample/logits_processor/interface.py +++ b/vllm/v1/sample/logits_processor/interface.py @@ -91,7 +91,7 @@ class LogitsProcessor(ABC): to each forward pass. Args: - batch_update is non-None iff there have been - changes to the batch makeup. + batch_update: Non-None iff there have been changes + to the batch makeup. """ raise NotImplementedError diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index b2354c53302ad..2d9ce3101b6c9 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -68,7 +68,7 @@ class RejectionSampler(nn.Module): different requests are flattened into a single tensor because this is the shape of the output logits. NOTE: `target_logits` can be updated in place to save memory. - bonus_token_ids_tensor (torch.Tensor): + bonus_token_ids (torch.Tensor): A tensor containing bonus tokens. Shape is [batch_size, 1]. Bonus tokens are added to the end of the sequence if all proposed tokens are accepted. We generate the bonus tokens diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py index 04545d587e4a9..e84136e3a6d07 100644 --- a/vllm/v1/sample/tpu/sampler.py +++ b/vllm/v1/sample/tpu/sampler.py @@ -89,7 +89,7 @@ class Sampler(nn.Module): Gather logprobs for topk and sampled/prompt token. 
Args: - logits: (num tokens) x (vocab) tensor + logprobs: (num tokens) x (vocab) tensor num_logprobs: minimum number of logprobs to retain per token token_ids: prompt tokens (if prompt logprobs) diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py index d500783aa4b30..9a53aa7a1ad10 100644 --- a/vllm/v1/structured_output/backend_types.py +++ b/vllm/v1/structured_output/backend_types.py @@ -110,7 +110,7 @@ class StructuredOutputBackend(ABC): Args: request_type (StructuredOutputOptions): The type of structured - output request. + output request. grammar_spec (str): The grammar specification to compile. Returns: @@ -124,7 +124,7 @@ class StructuredOutputBackend(ABC): Args: max_num_seqs (int): The maximum number of sequences for which - to allocate the bitmask. + to allocate the bitmask. """ @abstractmethod diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 284af6bfedce0..f4c2f45df5954 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -525,9 +525,6 @@ class InputBatch: Any consecutive empty indices at the very end of the list are not filled. - Args: - empty_req_indices: empty indices which may be filled. - Returns: swaps: list of (from,to) swap tuples for moved requests empty_req_indices: indices not filled by condensation diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 20d2d20ba0967..01c90b2ea38d3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2955,7 +2955,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): Args: kv_cache_config: The KV cache config kv_cache_raw_tensors: The KV cache buffer of each layer, with - correct size but uninitialized shape. + correct size but uninitialized shape. Returns: Dict[str, torch.Tensor]: A map between layer names to their corresponding memory buffer for KV cache. 
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index d364236604274..70ffde39ca333 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -552,7 +552,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return kv_cache_spec def _get_slot_mapping_metadata(self, num_reqs, - num_scheduled_tokens_per_req): + num_scheduled_tokens_per_req) -> np.ndarray: """ Computes metadata for mapping slots to blocks in the key-value (KV) cache for a batch of requests. @@ -565,15 +565,15 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): Args: num_reqs (int): Number of requests in the current batch. num_scheduled_tokens_per_req (int or np.ndarray): Number of tokens - to be scheduled for each request. + to be scheduled for each request. Returns: np.ndarray: A 2D array of shape (total_block_len, 3), where each row - contains: + contains: - kv_cache_start_index (int): The starting index in the KV cache - for the corresponding slice. + for the corresponding slice. - new_kv_start_index (int): The starting index in the new KV - cache for the corresponding slice. + cache for the corresponding slice. - slice_len (int): The length of the slice. """ slices_start = self.input_batch.num_computed_tokens_cpu[:num_reqs] diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index f407534687662..a519336e41616 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -172,10 +172,10 @@ def scatter_mm_placeholders( Args: embeds: The multimodal embeddings. - Shape: `(num_embeds, embed_dim)` + Shape: `(num_embeds, embed_dim)` is_embed: A boolean mask indicating which positions in the placeholder - tokens need to be filled with multimodal embeddings. - Shape: `(num_placeholders, num_embeds)` + tokens need to be filled with multimodal embeddings. 
+ Shape: `(num_placeholders, num_embeds)` """ if is_embed is None: return embeds @@ -278,7 +278,7 @@ def bind_kv_cache( Args: kv_caches: The allocated kv_caches with layer names as keys. forward_context: The global forward context containing all Attention - layers with layer names as keys. + layers with layer names as keys. runner_kv_caches: The kv_cache declared by ModelRunner. """ # Bind kv_caches to ModelRunner diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index 9c93754f93f81..038ce4b54f960 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -36,8 +36,8 @@ class WorkerBase(WorkerBaseV0): local_rank: Local device index rank: Global rank in distributed setup distributed_init_method: Distributed initialization method - is_driver_worker: Whether this worker handles driver - responsibilities + is_driver_worker: Whether this worker handles driver + responsibilities """ # Configuration storage super().__init__(vllm_config=vllm_config) From 3c0ef769bace3d48b276c7233ed6f39fe03f95b7 Mon Sep 17 00:00:00 2001 From: Eli Uriegas <1700823+seemethere@users.noreply.github.com> Date: Wed, 27 Aug 2025 10:41:48 -0700 Subject: [PATCH 101/112] ci: Add arm64 docker build to release pipeline (#23210) Signed-off-by: Eli Uriegas <eliuriegas@meta.com> Signed-off-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com> --- .buildkite/release-pipeline.yaml | 38 +++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index f96c38bf57db7..86aae426c258c 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -7,7 +7,7 @@ steps: commands: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 
--build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" @@ -62,23 +62,49 @@ steps: env: DOCKER_BUILDKIT: "1" - - block: "Build release image" + - block: "Build release image (x86)" depends_on: ~ key: block-release-image-build - - label: "Build release image" + - label: "Build release image (x86)" depends_on: block-release-image-build - id: build-release-image + id: build-release-image-x86 agents: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." 
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" + # re-tag to default image tag and push, just in case arm64 build fails + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + - label: "Build release image (arm64)" + depends_on: block-release-image-build + id: build-release-image-arm64 + agents: + queue: arm64_cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." 
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" + + # Add job to create multi-arch manifest + - label: "Create multi-arch manifest" + depends_on: + - build-release-image-x86 + - build-release-image-arm64 + id: create-multi-arch-manifest + agents: + queue: cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend" + - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + - label: "Annotate release workflow" depends_on: - - build-release-image + - create-multi-arch-manifest - build-wheel-cuda-12-8 - build-wheel-cuda-12-6 - build-wheel-cuda-11-8 From 0585a9e73c072a8cbb1a64bea3c26dd0d2dde402 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 27 Aug 2025 20:03:05 +0100 Subject: [PATCH 102/112] Disable `torch.compile` for dynamic rope models in Transformers backend (#23738) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 25 +++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index fc242d1adafd0..dffc347a73668 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -88,6 +88,23 @@ def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module): logger.debug("%s: %s -> %s", name, old_module, new_module) +def can_enable_torch_compile(vllm_config: VllmConfig) -> bool: + """ + Callable to be passed to `@support_torch_compile`'s `enable_if` argument. 
+ + Defaults to `True` but is disabled in the following situations: + + - The model uses dynamic rope scaling. + """ + enable = True + text_config = vllm_config.model_config.hf_config.get_text_config() + # Dynamic rope scaling is not compatible with torch.compile + rope_scaling: dict = getattr(text_config, "rope_scaling", None) or {} + if rope_scaling.get("rope_type") == "dynamic": + enable = False + return enable + + def replace_linear_class( linear: nn.Linear, style: Literal["colwise", "rowwise"], quant_config: QuantizationConfig @@ -641,7 +658,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) -@support_torch_compile +@support_torch_compile(enable_if=can_enable_torch_compile) class TransformersModel(TransformersBase): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ @@ -653,7 +670,7 @@ class TransformersModel(TransformersBase): }) -@support_torch_compile +@support_torch_compile(enable_if=can_enable_torch_compile) class TransformersForCausalLM(TransformersBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -709,12 +726,14 @@ def flatten_and_concat(x: list[torch.Tensor]) -> torch.Tensor: info=MultiModalProcessingInfo, dummy_inputs=MultiModalDummyInputsBuilder) @support_torch_compile( + # set `positions` to last dim to support Qwen-mrope dynamic_arg_dims={ "input_ids": 0, "positions": -1, "intermediate_tensors": 0, "inputs_embeds": 0, - }) # set `positions` to last dim to support Qwen-mrope + }, + enable_if=can_enable_torch_compile) class TransformersForMultimodalLM(TransformersForCausalLM, SupportsMultiModal): # Backwards compatibility for prev released models. 
State dicts back then # had different formats and cannot be loaded with `AutoModel` mapping as is From 8bf6266a17933b130f94f6d53f32ac029ed8ba1b Mon Sep 17 00:00:00 2001 From: Roger Wang <hey@rogerw.io> Date: Wed, 27 Aug 2025 13:24:31 -0700 Subject: [PATCH 103/112] [Multimodal] Generate mm_hash based on request metadata when caching is turned off (#23690) Signed-off-by: Roger Wang <hey@rogerw.io> --- vllm/inputs/preprocess.py | 71 ++++++++++++++++--- vllm/model_executor/models/deepseek_vl2.py | 3 + vllm/model_executor/models/h2ovl.py | 3 + vllm/model_executor/models/llava.py | 8 ++- vllm/model_executor/models/mllama.py | 8 ++- vllm/model_executor/models/paligemma.py | 8 ++- vllm/model_executor/models/pixtral.py | 2 + .../models/prithvi_geospatial_mae.py | 7 +- vllm/model_executor/models/transformers.py | 7 +- vllm/model_executor/models/voxtral.py | 2 + vllm/multimodal/processing.py | 36 ++++++++-- vllm/v1/engine/processor.py | 48 +++++++++++++ 12 files changed, 179 insertions(+), 24 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index f0d0cab3df3d9..fff9c42fe36fe 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -257,6 +257,8 @@ class InputPreprocessor: mm_processor_kwargs: Optional[Mapping[str, object]], tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalInputs: """ Apply the model's multi-modal processor to a multi-modal prompt, @@ -273,10 +275,13 @@ class InputPreprocessor: if mm_processor_kwargs is None: mm_processor_kwargs = {} - return mm_processor.apply(prompt, - mm_data, - hf_processor_mm_kwargs=mm_processor_kwargs, - tokenization_kwargs=tokenization_kwargs) + return mm_processor.apply( + prompt, + mm_data, + hf_processor_mm_kwargs=mm_processor_kwargs, + tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, + ) async def _process_multimodal_async( 
self, @@ -285,6 +290,8 @@ class InputPreprocessor: mm_processor_kwargs: Optional[Mapping[str, object]], tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalInputs: """ Async version of @@ -301,10 +308,13 @@ class InputPreprocessor: if mm_processor_kwargs is None: mm_processor_kwargs = {} - return mm_processor.apply(prompt, - mm_data, - hf_processor_mm_kwargs=mm_processor_kwargs, - tokenization_kwargs=tokenization_kwargs) + return mm_processor.apply( + prompt, + mm_data, + hf_processor_mm_kwargs=mm_processor_kwargs, + tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, + ) def _process_embeds( self, @@ -341,6 +351,8 @@ class InputPreprocessor: parsed_content: TokensPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_token_ids = parsed_content["prompt_token_ids"] token_type_ids = parsed_content.get("token_type_ids") @@ -353,6 +365,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) else: inputs = token_inputs( @@ -370,6 +383,8 @@ class InputPreprocessor: parsed_content: TokensPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_token_ids = parsed_content["prompt_token_ids"] token_type_ids = parsed_content.get("token_type_ids") @@ -382,6 +397,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) else: inputs = token_inputs( @@ -399,6 
+415,8 @@ class InputPreprocessor: parsed_content: TextPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_text = parsed_content["prompt"] @@ -410,6 +428,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) else: prompt_token_ids = self._tokenize_prompt( @@ -432,6 +451,8 @@ class InputPreprocessor: parsed_content: TextPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_text = parsed_content["prompt"] @@ -443,6 +464,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) else: prompt_token_ids = await self._tokenize_prompt_async( @@ -465,6 +487,8 @@ class InputPreprocessor: prompt: SingletonPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> SingletonInputs: """ Extract the singleton inputs from a prompt. 
@@ -486,18 +510,21 @@ class InputPreprocessor: return self._process_tokens( parsed["content"], lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) if parsed["type"] == "text": return self._process_text( parsed["content"], tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) if parsed["type"] == "str": return self._process_text( TextPrompt(prompt=parsed["content"]), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) assert_never(parsed) @@ -507,6 +534,8 @@ class InputPreprocessor: prompt: SingletonPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> SingletonInputs: """ Async version of @@ -520,18 +549,21 @@ class InputPreprocessor: return await self._process_tokens_async( parsed["content"], lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) if parsed["type"] == "text": return await self._process_text_async( parsed["content"], tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) if parsed["type"] == "str": return await self._process_text_async( TextPrompt(prompt=parsed["content"]), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) assert_never(parsed) @@ -641,6 +673,8 @@ class InputPreprocessor: self, prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> EncoderDecoderInputs: """ For encoder/decoder models only: @@ -682,6 +716,7 @@ class InputPreprocessor: encoder_inputs = self._prompt_to_llm_inputs( prompt["encoder_prompt"], tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) if (decoder_input := prompt["decoder_prompt"]) is None: decoder_inputs = None @@ -697,6 +732,7 
@@ class InputPreprocessor: inputs = self._prompt_to_llm_inputs( prompt, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) if self.model_config.is_multimodal_model: # Encoder-Decoder Multimodal model @@ -712,6 +748,8 @@ class InputPreprocessor: self, prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> EncoderDecoderInputs: """ Async version of @@ -724,6 +762,7 @@ class InputPreprocessor: encoder_task = self._prompt_to_llm_inputs_async( prompt["encoder_prompt"], tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) if (decoder_input := prompt["decoder_prompt"]) is None: @@ -733,6 +772,7 @@ class InputPreprocessor: decoder_task = self._prompt_to_llm_inputs_async( decoder_input, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) encoder_inputs, decoder_inputs = await asyncio.gather( @@ -748,6 +788,7 @@ class InputPreprocessor: inputs = await self._prompt_to_llm_inputs_async( prompt, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) if self.model_config.is_multimodal_model: # Encoder-Decoder Multimodal model @@ -774,6 +815,8 @@ class InputPreprocessor: prompt: SingletonPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> DecoderOnlyInputs: """ For decoder-only models: @@ -794,6 +837,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) return self._build_decoder_only_llm_inputs(prompt_comps) @@ -803,6 +847,8 @@ class InputPreprocessor: prompt: SingletonPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> DecoderOnlyInputs: 
""" Async version of @@ -812,6 +858,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) return self._build_decoder_only_llm_inputs(prompt_comps) @@ -821,6 +868,8 @@ class InputPreprocessor: prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> ProcessorInputs: """Preprocess the input prompt.""" if self.model_config.is_encoder_decoder: @@ -829,6 +878,7 @@ class InputPreprocessor: return self._process_encoder_decoder_prompt( prompt, tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) if is_explicit_encoder_decoder_prompt(prompt): @@ -840,6 +890,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) async def preprocess_async( @@ -847,6 +898,8 @@ class InputPreprocessor: prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> ProcessorInputs: """ Async version of @@ -858,6 +911,7 @@ class InputPreprocessor: return await self._process_encoder_decoder_prompt_async( prompt, tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) if is_explicit_encoder_decoder_prompt(prompt): @@ -869,6 +923,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) def clear_cache(self) -> None: diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index ceb5e1364b68d..1bd2802a86838 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -290,6 +290,7 @@ class DeepseekVL2MultiModalProcessor( mm_data_items: MultiModalDataItems, 
hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: # The processor logic is different for len(images) <= 2 vs > 2 # Since the processing cache assumes that the processor output is @@ -301,6 +302,7 @@ class DeepseekVL2MultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) return super()._cached_apply_hf_processor( @@ -308,6 +310,7 @@ class DeepseekVL2MultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 87e451a2769ea..306775af68065 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -479,6 +479,7 @@ class H2OVLMultiModalProcessor( mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: # The processor logic is different for len(images) <= 1 vs > 1 # Since the processing cache assumes that the processor output is @@ -490,6 +491,7 @@ class H2OVLMultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) return super()._cached_apply_hf_processor( @@ -497,6 +499,7 @@ class H2OVLMultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 
0ee26b68345c3..8a847a6180f3a 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -795,6 +795,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalInputs: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index @@ -805,8 +806,11 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): image_height=-1, ) - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs, - tokenization_kwargs) + result = super().apply(prompt, + mm_data, + hf_processor_mm_kwargs, + tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides) mm_items = self._to_mm_items(mm_data) mm_item_counts = mm_items.get_all_counts() diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 2a60450de4141..cc2216996f032 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -184,9 +184,13 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo] mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalEncDecInputs: - mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs, - tokenization_kwargs) + mm_inputs = super().apply(prompt, + mm_data, + hf_processor_mm_kwargs, + tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides) image_token_id = self.info.get_hf_config().image_token_index # Check that the number of image tokens in the decoder prompt matches diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 95abb190e0a46..b74a09ee92c33 100644 --- a/vllm/model_executor/models/paligemma.py +++ 
b/vllm/model_executor/models/paligemma.py @@ -203,9 +203,13 @@ class PaliGemmaMultiModalProcessor( mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalInputs: - mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs, - tokenization_kwargs) + mm_inputs = super().apply(prompt, + mm_data, + hf_processor_mm_kwargs, + tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides) prompt_token_ids = mm_inputs["prompt_token_ids"] tokenizer = self.info.get_tokenizer() diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 461b9c85d1c22..a74e01a59697e 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -314,12 +314,14 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo] mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: prompt_ids, mm_info, _ = super()._cached_apply_hf_processor( prompt=prompt, mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) # NOTE: The tokens are already inserted by the chat template diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index 2d14fe6d5892f..2edc357d2df1b 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -138,6 +138,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + 
mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalInputs: if "image" in mm_data: image_data = mm_data["image"] @@ -146,8 +147,10 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): mm_data = {"image": mm_data} mm_items = self._to_mm_items(mm_data) - mm_hashes = self._hash_mm_items(mm_items, hf_processor_mm_kwargs, - tokenization_kwargs or {}) + tokenization_kwargs = tokenization_kwargs or {} + mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else + self._hash_mm_items(mm_items, hf_processor_mm_kwargs, + tokenization_kwargs)) mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]} mm_processed_data = BatchFeature(image_data) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index dffc347a73668..edf3dddb1bad2 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -327,6 +327,7 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -393,9 +394,11 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs, num_image_patches), ) + # Use overrides if provided; fallback to data-dependent hashing. 
+ mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else + self._hash_mm_items(mm_items, hf_processor_mm_kwargs, + tokenization_kwargs)) - mm_hashes = self._hash_mm_items(mm_items, hf_processor_mm_kwargs, - tokenization_kwargs) return MultiModalInputs( type="multimodal", prompt=prompt, diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 77f11a691e080..eed8d89ca4f5a 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -288,12 +288,14 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo] mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: prompt_ids, mm_info, _ = super()._cached_apply_hf_processor( prompt=prompt, mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) # NOTE: The tokens are already inserted by the chat template diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 6ecdf80d4aa6f..41595df2e2624 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1020,8 +1020,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): prompt: str, mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], + *, + mm_hash_overrides: Optional[MultiModalHashes] = None, ) -> MultiModalInputs: - return self.apply(prompt, mm_data, hf_processor_mm_kwargs) + return self.apply(prompt, + mm_data, + hf_processor_mm_kwargs, + mm_hash_overrides=mm_hash_overrides) def _get_data_parser(self) -> MultiModalDataParser: """ @@ -1357,7 +1362,11 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], ) -> MultiModalHashes: - 
"""Create MM hashes to be returned (only used in V1).""" + """Create MM hashes to be returned (only used in V1). + + Note: When overrides are provided via callers of `apply`, + `_hash_mm_items` will be bypassed and the overrides will be used. + """ model_id = self.info.model_id return { @@ -1464,6 +1473,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], + *, + mm_hash_overrides: Optional[MultiModalHashes] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: ( prompt_ids, @@ -1483,8 +1494,10 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs), ) - mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, - tokenization_kwargs) + # Use overrides if provided; fallback to data-dependent hashing. + mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else + self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, + tokenization_kwargs)) mm_prompt_updates = self._get_mm_prompt_updates( mm_data_items, @@ -1506,6 +1519,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], + *, + mm_hash_overrides: Optional[MultiModalHashes] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: """ Apply the HF processor on the full prompt text, @@ -1520,10 +1535,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) - mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, - tokenization_kwargs) + # Use overrides if provided; fallback to data-dependent hashing. 
+ mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else + self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, + tokenization_kwargs)) mm_missing_data_items = self._get_cache_missing_items( cache=cache, @@ -1723,6 +1741,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + *, + mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -1751,6 +1771,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_items, hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) # NOTE: tokenization_kwargs are not required to init processor @@ -1835,6 +1856,8 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, + *, + mm_hash_overrides: Optional[MultiModalHashes] = None, ) -> MultiModalEncDecInputs: """ Process multi-modal inputs to be used in vLLM. @@ -1849,6 +1872,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): mm_data, hf_processor_mm_kwargs, tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides, ) return self._get_enc_dec_inputs( diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 7ed60156626bf..df915258d8637 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -225,6 +225,41 @@ class Processor: # Remember that this backend was set automatically params.guided_decoding.backend_was_auto = True + def _maybe_build_mm_hash_overrides( + self, + request_id: str, + prompt: PromptType, + ) -> Optional[dict[str, list[str]]]: + """Build per-item multimodal hash overrides when enabled. 
In this case, + multimodal data items are identified by their request id, modality and + index rather than their content. + + Returns a dictionary of modality -> list[str] of overrides, or None if + disabled or no multimodal data is present. + """ + + def _extract_mm_data(p: PromptType): + if isinstance(p, dict) and "encoder_prompt" in p: + enc = p.get("encoder_prompt") + if isinstance(enc, dict): + return enc.get("multi_modal_data") + return None + if isinstance(p, dict): + return p.get("multi_modal_data") + return None + + mm_data = _extract_mm_data(prompt) + if not mm_data: + return None + + overrides: dict[str, list[str]] = {} + for modality, data in mm_data.items(): + n = len(data) if isinstance(data, list) else 1 + overrides[modality] = [ + f"{request_id}-{modality}-{i}" for i in range(n) + ] + return overrides + def process_inputs( self, request_id: str, @@ -254,6 +289,18 @@ class Processor: if arrival_time is None: arrival_time = time.time() + # Optionally generate multimodal hash overrides based on request id. + # NOTE: when users explicitly turn off BOTH prefix caching and input + # processing caching, no multimodal features or embeddings will be + # reused across requests, therefore hashing is no longer necessary. + if (self.model_config.multimodal_config and + self.model_config.multimodal_config.mm_processor_cache_gb == 0 + and not self.cache_config.enable_prefix_caching): + mm_hash_overrides = self._maybe_build_mm_hash_overrides( + request_id, prompt) + else: + mm_hash_overrides = None + # Process inputs, which includes: # 1. Tokenize text prompt, with LoRA request if one exists. # 2. 
For multimodal models with a merged preprocessor, preprocess @@ -262,6 +309,7 @@ class Processor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, + mm_hash_overrides=mm_hash_overrides, ) from vllm.platforms import current_platform current_platform.validate_request( From 853c371fc33e7c99aa2ab9f6e2cd7cbd1cadcf99 Mon Sep 17 00:00:00 2001 From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Date: Wed, 27 Aug 2025 23:53:30 +0300 Subject: [PATCH 104/112] [V1][Mamba] - Enable V1 by default for Mamba Models (#23650) Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com> --- .../models/language/generation/test_hybrid.py | 147 ++++++++---------- vllm/engine/arg_utils.py | 5 - vllm/model_executor/models/config.py | 1 + 3 files changed, 70 insertions(+), 83 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 7e7cc893ec8aa..31ca3a6f0f985 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -100,21 +100,19 @@ def test_models( else: hf_outputs = None - if model not in V0_UNSUPPORTED_MODELS: - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - vllm_v0_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - else: - vllm_v0_outputs = None + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + if model not in V0_UNSUPPORTED_MODELS: + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + vllm_v0_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + else: + vllm_v0_outputs = None if model in V1_SUPPORTED_MODELS: - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - with vllm_runner(model, - max_num_seqs=MAX_NUM_SEQS, - enable_prefix_caching=False) as vllm_model: - vllm_v1_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, 
num_logprobs) + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + vllm_v1_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) else: vllm_v1_outputs = None @@ -137,7 +135,7 @@ def test_models( ) -@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS) +@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) def test_batching( @@ -147,10 +145,6 @@ def test_batching( max_tokens: int, num_logprobs: int, ) -> None: - if model in V0_UNSUPPORTED_MODELS: - pytest.skip( - f"Unsupported V0 Engine. Skipping `test_batching` on {model}.") - try: model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") @@ -188,29 +182,32 @@ def test_chunked_prefill( max_tokens: int, num_logprobs: int, chunked_prefill_token_size: int, + monkeypatch, ) -> None: max_num_seqs = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size - with vllm_runner(model, - enable_chunked_prefill=True, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs) as vllm_model: - chunked = vllm_model.generate_greedy_logprobs(example_prompts, - max_tokens, num_logprobs) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + with vllm_runner(model, + enable_chunked_prefill=True, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs) as vllm_model: + chunked = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) - with vllm_runner(model, - enable_chunked_prefill=False, - max_num_seqs=max_num_seqs) as vllm_model: - non_chunked = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + with vllm_runner(model, + enable_chunked_prefill=False, + max_num_seqs=max_num_seqs) as vllm_model: + non_chunked = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) - 
check_logprobs_close( - outputs_0_lst=chunked, - outputs_1_lst=non_chunked, - name_0="chunked", - name_1="non_chunked", - ) + check_logprobs_close( + outputs_0_lst=chunked, + outputs_1_lst=non_chunked, + name_0="chunked", + name_1="non_chunked", + ) @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @@ -281,25 +278,29 @@ def test_models_preemption_recompute( example_prompts, model: str, max_tokens: int, + monkeypatch, ) -> None: """ Tests that outputs are identical with and w/o preemptions (recompute). """ - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - scheduler = vllm_model.llm.llm_engine.scheduler[0] - scheduler.ENABLE_ARTIFICIAL_PREEMPT = True - preempt_vllm_outputs = vllm_model.generate_greedy( - example_prompts, max_tokens) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + scheduler = vllm_model.llm.llm_engine.scheduler[0] + scheduler.ENABLE_ARTIFICIAL_PREEMPT = True + preempt_vllm_outputs = vllm_model.generate_greedy( + example_prompts, max_tokens) - scheduler.ENABLE_ARTIFICIAL_PREEMPT = False - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + scheduler.ENABLE_ARTIFICIAL_PREEMPT = False + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) - check_outputs_equal( - outputs_0_lst=preempt_vllm_outputs, - outputs_1_lst=vllm_outputs, - name_0="vllm_preepmtions", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=preempt_vllm_outputs, + outputs_1_lst=vllm_outputs, + name_0="vllm_preepmtions", + name_1="vllm", + ) @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @@ -402,24 +403,18 @@ def test_full_cuda_graph( else: hf_outputs = None - if model not in V0_UNSUPPORTED_MODELS: - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - vllm_v0_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - else: - vllm_v0_outputs = 
None - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - if model in HYBRID_MODELS: - # required due to reorder_batch behaviour - m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") - with vllm_runner(model, - max_num_seqs=MAX_NUM_SEQS, - compilation_config={'full_cuda_graph': True}, - enable_prefix_caching=False) as vllm_model: - vllm_v1_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + m.setenv("VLLM_USE_V1", "0") + if model not in V0_UNSUPPORTED_MODELS: + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + vllm_v0_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + else: + vllm_v0_outputs = None + + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + vllm_v1_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) if hf_outputs is not None and vllm_v0_outputs is not None: check_logprobs_close( @@ -466,24 +461,20 @@ def test_fp32_state( else: hf_outputs = None + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + with vllm_runner(model, + max_num_seqs=MAX_NUM_SEQS, + mamba_ssm_cache_dtype="float32") as vllm_model: + vllm_v0_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, mamba_ssm_cache_dtype="float32") as vllm_model: - vllm_v0_outputs = vllm_model.generate_greedy_logprobs( + vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - if model in HYBRID_MODELS: - # required due to reorder_batch behaviour - m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") - with vllm_runner(model, - max_num_seqs=MAX_NUM_SEQS, - mamba_ssm_cache_dtype="float32", - enable_prefix_caching=False) as vllm_model: - vllm_v1_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) 
- if hf_outputs is not None: check_logprobs_close( outputs_0_lst=hf_outputs, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3399d505e3631..e4d205aeb8633 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1463,11 +1463,6 @@ class EngineArgs: recommend_to_remove=False) return False - # V1 mamba models are unoptimized. - if model_config.has_inner_state and _warn_or_fallback( - feature_name="Mamba"): - return False - # No Concurrent Partial Prefills so far. if (self.max_num_partial_prefills != SchedulerConfig.max_num_partial_prefills diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 88b3154de2cbb..b0dbfacece3ab 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -417,4 +417,5 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "GptOssForCausalLM": GptOssForCausalLMConfig, "MambaForCausalLM": MambaModelConfig, "Mamba2ForCausalLM": MambaModelConfig, + "FalconMambaForCausalLM": MambaModelConfig, } From 082cc07ef8f810bea61eaed77a60137684ca78f8 Mon Sep 17 00:00:00 2001 From: Yongye Zhu <zyy1102000@gmail.com> Date: Wed, 27 Aug 2025 17:33:21 -0400 Subject: [PATCH 105/112] DP/EP Support for gpt-oss with deepep-ht comm kernel on SM100 (#23608) --- .../base_device_communicator.py | 2 +- .../model_executor/layers/fused_moe/config.py | 6 + vllm/model_executor/layers/fused_moe/layer.py | 6 +- .../layers/fused_moe/trtllm_moe.py | 197 ++++++++++++++++++ vllm/model_executor/layers/fused_moe/utils.py | 16 ++ .../compressed_tensors_moe.py | 8 +- .../model_executor/layers/quantization/fp8.py | 1 + .../layers/quantization/modelopt.py | 2 + .../layers/quantization/mxfp4.py | 110 ++++++++++ .../layers/quantization/utils/mxfp4_utils.py | 9 +- .../layers/quantization/utils/mxfp8_utils.py | 20 ++ 11 files changed, 365 insertions(+), 12 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/trtllm_moe.py create mode 100644 
vllm/model_executor/layers/quantization/utils/mxfp8_utils.py diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index 9e5aa4e4c2a89..9131582eef754 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -255,7 +255,7 @@ class DeviceCommunicatorBase: if module.__class__.__name__ == "FusedMoE" ] for module in moe_modules: - module.quant_method.init_prepare_finalize() + module.quant_method.init_prepare_finalize(module) def dispatch( self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 7c1a7b636a9c2..cab610decf901 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -450,6 +450,12 @@ class FusedMoEConfig: if quant_dtype is None and isinstance(quant_config, Fp8Config): quant_dtype = torch.float8_e4m3fn + from vllm.model_executor.layers.quantization.mxfp4 import ( + Mxfp4Config) + if (quant_dtype is None and isinstance(quant_config, Mxfp4Config) + and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8): + quant_dtype = "mxfp8" + from vllm.model_executor.layers.quantization.modelopt import ( ModelOptNvFp4Config) if quant_dtype is None and isinstance(quant_config, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 54406a5a2d87f..b9de03ddd216e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -200,7 +200,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): # Note: init_prepare_finalize should only be called by # prepare_communication_buffer_for_model. 
- def init_prepare_finalize(self): + def init_prepare_finalize(self, layer: torch.nn.Module): assert self.moe is not None prepare_finalize = self.maybe_make_prepare_finalize(self.moe) @@ -211,7 +211,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): assert self.fused_experts is None, \ f"Attempt to override experts for {id(self)}!" self.topk_indices_dtype = prepare_finalize.topk_indices_dtype() - experts = self.select_gemm_impl(prepare_finalize, self.moe) + experts = self.select_gemm_impl(prepare_finalize, self.moe, layer) self.fused_experts = FusedMoEModularKernel( prepare_finalize, experts, @@ -221,6 +221,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): self, prepare_finalize: FusedMoEPrepareAndFinalize, moe: FusedMoEConfig, + layer: torch.nn.Module, ) -> FusedMoEPermuteExpertsUnpermute: # based on the all2all implementation, select the appropriate # gemm implementation @@ -273,6 +274,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): prepare_finalize: FusedMoEPrepareAndFinalize, # TODO(bnell): Remove. Every layer should have an moe config object. 
moe: FusedMoEConfig, + layer: torch.nn.Module, ) -> FusedMoEPermuteExpertsUnpermute: if (prepare_finalize.activation_format == FusedMoEActivationFormat.BatchedExperts): diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py new file mode 100644 index 0000000000000..14dfce4b0e3aa --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py @@ -0,0 +1,197 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceNoOP) +from vllm.utils import next_power_of_2 + + +class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): + + def __init__( + self, + moe: FusedMoEConfig, + gemm1_alpha, + gemm1_beta, + gemm1_clamp_limit, + w13_bias, + w2_bias, + max_capture_size, + ): + super().__init__(moe.quant_config) + self.moe = moe + self.gemm1_alpha = gemm1_alpha + self.gemm1_beta = gemm1_beta + self.gemm1_clamp_limit = gemm1_clamp_limit + self.w13_bias = w13_bias + self.w2_bias = w2_bias + self.max_capture_size = max_capture_size + + @property + def activation_formats( + self + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return (mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard) + + def supports_chunking(self) -> bool: + return True + + def supports_expert_map(self) -> bool: + return True + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + return TopKWeightAndReduceNoOP() + + def workspace_shapes( + self, + a: torch.Tensor, + aq: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + ) -> 
tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: + # The workspaces for this implementation are managed by flashinfer. + # TODO(varun) : workspace1 is could be used as the output tensor. This + # is error-prone. Allow the `workspace_shapes` to return None workspaces + workspace1 = (M, K) + workspace2 = (0, 0) + output = (M, K) + return (workspace1, workspace2, output, a.dtype) + + def _get_tile_tokens_dim(self, x: torch.Tensor, top_k: int, + local_num_experts: int): + # Number of tokens in the input tensor. + num_tokens = x.shape[0] + # Factor to account for the imbalance of the experts. + # factor equals to the + # max_real_num_tokens_per_expert / perfect_num_tokens_per_expert + # 1.0 means perfect expert distribution. + # > 1.0 means some experts have more tokens than the perfect + # distribution. + # < 1.0 does not make sense. + imbalance_factor = 1.3 + # Calculate the number of tokens per expert assuming perfect + # distribution. + num_tokens_per_expert = (num_tokens * top_k) // local_num_experts + # Apply the imbalance factor. + num_tokens_per_expert = int(num_tokens_per_expert * imbalance_factor) + # And pad the number to the next power of 2. + tile_tokens_dim = next_power_of_2(num_tokens_per_expert) + # Cap to 8-64 tokens per CTA tile as it's the range supported by the + # kernel. 
+ tile_tokens_dim = min(max(tile_tokens_dim, 8), 64) + + return tile_tokens_dim + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ): + topk = topk_ids.size(-1) + local_num_experts = w1.size(0) + intermediate_size = w2.size(1) + local_expert_offset = self.moe.ep_rank * local_num_experts + + x_quant = hidden_states + x_scale = a1q_scale + if x_scale is not None: + x_scale = x_scale.view(torch.float8_e4m3fn).reshape( + *x_quant.shape[:-1], -1) + + packed_tensor = (topk_ids.to(torch.int32) << 16) | topk_weights.to( + torch.bfloat16).view(torch.int16) + + assert w1_scale is not None + assert w2_scale is not None + kwargs = { + "topk_ids": + packed_tensor, + "routing_bias": + None, + "hidden_states": + x_quant, + "hidden_states_scale": + x_scale, + "gemm1_weights": + w1, + "gemm1_weights_scale": + w1_scale, + "gemm1_bias": + self.w13_bias, + "gemm1_alpha": + self.gemm1_alpha, + "gemm1_beta": + self.gemm1_beta, + "gemm1_clamp_limit": + self.gemm1_clamp_limit, + "gemm2_weights": + w2, + "gemm2_weights_scale": + w2_scale, + "gemm2_bias": + self.w2_bias, + "output1_scale_scalar": + None, + "output1_scale_gate_scalar": + None, + "output2_scale_scalar": + None, + "num_experts": + global_num_experts, + "top_k": + topk, + "n_group": + None, + "topk_group": + None, + "intermediate_size": + intermediate_size, + "local_expert_offset": + local_expert_offset, + "local_num_experts": + local_num_experts, + 
"routed_scaling_factor": + None, + "tile_tokens_dim": + self._get_tile_tokens_dim(x_quant, topk, local_num_experts), + "routing_method_type": + 1, + "do_finalize": + True, + "output": + output, + "tune_max_num_tokens": + self.max_capture_size, + } + + from flashinfer import trtllm_fp4_block_scale_routed_moe + trtllm_fp4_block_scale_routed_moe(**kwargs) + return output diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index 4c3e700ad3990..1aeb3f92bc3ea 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -12,6 +12,8 @@ from vllm.model_executor.layers.quantization.utils.int8_utils import ( per_token_group_quant_int8, per_token_quant_int8) from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( quant_dequant_mxfp4) +from vllm.model_executor.layers.quantization.utils.mxfp8_utils import ( + mxfp8_quantize) from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import cdiv @@ -177,6 +179,18 @@ def _mxfp4_quantize( return A, None +def _mxfp8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + per_act_token_quant: bool, + block_shape: Optional[list[int]] = None, +) -> tuple[torch.Tensor, torch.Tensor]: + assert A_scale is None + assert not per_act_token_quant + assert block_shape is None + return mxfp8_quantize(A) + + def moe_kernel_quantize_input( A: torch.Tensor, A_scale: Optional[torch.Tensor], @@ -195,6 +209,8 @@ def moe_kernel_quantize_input( is_sf_swizzled_layout=is_fp4_scale_swizzled) elif quant_dtype == "mxfp4": return _mxfp4_quantize(A, A_scale, per_act_token_quant, block_shape) + elif quant_dtype == "mxfp8": + return _mxfp8_quantize(A, A_scale, per_act_token_quant, block_shape) else: return A, A_scale diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py 
b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 6279bb8b60570..af9d1c46f68f4 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -322,6 +322,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): self, prepare_finalize: mk.FusedMoEPrepareAndFinalize, moe: FusedMoEConfig, + layer: torch.nn.Module, ) -> mk.FusedMoEPermuteExpertsUnpermute: """Return the appropriate GEMM experts implementation.""" experts = select_nvfp4_gemm_impl( @@ -719,10 +720,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): dtype=torch.int64) def select_gemm_impl( - self, - prepare_finalize: FusedMoEPrepareAndFinalize, - moe: FusedMoEConfig, - ) -> FusedMoEPermuteExpertsUnpermute: + self, prepare_finalize: FusedMoEPrepareAndFinalize, + moe: FusedMoEConfig, + layer: torch.nn.Module) -> FusedMoEPermuteExpertsUnpermute: # cutlass path if self.use_cutlass: from vllm.model_executor.layers.fused_moe import ( diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index be358cfa949f0..0200b0e9ed001 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -897,6 +897,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): self, prepare_finalize: FusedMoEPrepareAndFinalize, moe: FusedMoEConfig, + layer: torch.nn.Module, ) -> FusedMoEPermuteExpertsUnpermute: from vllm.model_executor.layers.fused_moe import ( BatchedTritonOrDeepGemmExperts, TritonOrDeepGemmExperts) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 72864853f7e0c..adce598c4ff1f 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -311,6 +311,7 @@ class 
ModelOptFp8MoEMethod(FusedMoEMethodBase): self, prepare_finalize: mk.FusedMoEPrepareAndFinalize, moe: FusedMoEConfig, + layer: torch.nn.Module, ) -> mk.FusedMoEPermuteExpertsUnpermute: experts = select_cutlass_fp8_gemm_impl( moe, @@ -1032,6 +1033,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): self, prepare_finalize: mk.FusedMoEPrepareAndFinalize, moe: FusedMoEConfig, + layer: torch.nn.Module, ) -> mk.FusedMoEPermuteExpertsUnpermute: experts = select_nvfp4_gemm_impl( moe, diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index bdeb169a4b97f..6724796904f01 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -10,6 +10,8 @@ from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase) +from vllm.model_executor.layers.fused_moe import modular_kernel as mk +from vllm.model_executor.layers.fused_moe.trtllm_moe import TrtLlmGenExperts from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -445,6 +447,91 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): return tile_tokens_dim + def select_gemm_impl( + self, + prepare_finalize: mk.FusedMoEPrepareAndFinalize, + moe: FusedMoEConfig, + layer: torch.nn.Module, + ) -> mk.FusedMoEPermuteExpertsUnpermute: + if (prepare_finalize.activation_format == + mk.FusedMoEActivationFormat.BatchedExperts): + raise NotImplementedError( + "Mxfp4 does not support batched experts format for EP") + else: + if should_use_flashinfer_mxfp4(): + # B200 code-path + kwargs = { + "gemm1_alpha": layer.gemm1_alpha, + "gemm1_beta": layer.gemm1_beta, + "gemm1_clamp_limit": layer.gemm1_clamp_limit, + "w13_bias": layer.w13_bias, + "w2_bias": layer.w2_bias, + "max_capture_size": self.max_capture_size, + } + return 
TrtLlmGenExperts(moe, **kwargs) + else: + # Use matmul_ogs from triton_kernels here! + raise NotImplementedError( + "Mxfp4 does not support non-batched experts format for EP") + + def _route_and_experts( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: Optional[torch.Tensor] = None, + logical_to_physical_map: Optional[torch.Tensor] = None, + logical_replica_count: Optional[torch.Tensor] = None + ) -> torch.Tensor: + + assert isinstance(self.fused_experts, mk.FusedMoEModularKernel) + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + enable_eplb=enable_eplb, + expert_map=expert_map, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count) + + return self.fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + apply_router_weight_on_input=apply_router_weight_on_input, + 
) + def apply( self, layer: torch.nn.Module, @@ -503,6 +590,29 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): activation=activation, expert_map=expert_map) + if self.fused_experts is not None: + return self._route_and_experts( + layer, + x, + router_logits, + top_k, + renormalize, + use_grouped_topk, + topk_group, + num_expert_group, + global_num_experts, + expert_map, + custom_routing_function, + scoring_func, + e_score_correction_bias, + apply_router_weight_on_input, + activation, + enable_eplb, + expert_load_view, + logical_to_physical_map, + logical_replica_count, + ) + assert _can_support_mxfp4( use_grouped_topk, topk_group, num_expert_group, expert_map, custom_routing_function, e_score_correction_bias, diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 48f9cc3737e47..3de928fea7202 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -66,11 +66,10 @@ def _can_support_mxfp4(use_grouped_topk: bool = False, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None): return not (use_grouped_topk or topk_group or num_expert_group - or expert_map or custom_routing_function - or e_score_correction_bias or apply_router_weight_on_input - or scoring_func != "softmax" or activation != "swigluoai" - or expert_load_view or logical_to_physical_map - or logical_replica_count) + or custom_routing_function or e_score_correction_bias + or apply_router_weight_on_input or scoring_func != "softmax" + or activation != "swigluoai" or expert_load_view + or logical_to_physical_map or logical_replica_count) def _dequant_mxfp4(x: torch.Tensor, scale: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py new file mode 100644 index 0000000000000..2a6b21c918f46 --- 
/dev/null +++ b/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def mxfp8_quantize(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + + try: + from flashinfer import mxfp8_quantize + except ImportError as err: + raise ImportError("The package `flashinfer` is required to do " + "MX-FP8 quantization. Please install it with" \ + "`pip install flashinfer`") from err + + return mxfp8_quantize(x, is_sf_swizzled_layout=False) From f9ca2b40a0357d98e3fb8bd951745dfaceab459e Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Wed, 27 Aug 2025 17:48:16 -0400 Subject: [PATCH 106/112] [Bugfix] Fix Marlin NVFP4 for modelopt (#23659) Signed-off-by: mgoin <mgoin64@gmail.com> --- .../layers/quantization/modelopt.py | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index adce598c4ff1f..9d4e453ffc545 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -891,7 +891,11 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): assert (layer.weight_scale.dtype == torch.float8_e4m3fn), ( "Weight Block scale must be represented as FP8-E4M3") - if self.backend == "flashinfer-trtllm": + if self.backend == "marlin": + prepare_fp4_layer_for_marlin(layer) + del layer.alpha + del layer.input_scale + elif self.backend == "flashinfer-trtllm": # FlashInfer TRTLLM FP4 GEMM requires a different weight layout. 
# FlashInfer provides nvfp4_quantize to quantize + shuffle the # layout but we use our own quantization so we have to call @@ -916,11 +920,6 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): requires_grad=False) layer.weight = Parameter(layer.weight.data, requires_grad=False) - if self.backend == "marlin": - prepare_fp4_layer_for_marlin(layer) - del layer.alpha - del layer.input_scale - def apply( self, layer: torch.nn.Module, @@ -1312,6 +1311,13 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): del layer.w2_weight_scale del layer.w13_weight del layer.w13_weight_scale + elif self.use_marlin: + # Marlin processing + prepare_moe_fp4_layer_for_marlin(layer) + del layer.g1_alphas + del layer.g2_alphas + del layer.w13_input_scale_quant + del layer.w2_input_scale_quant else: # Non-TRT-LLM processing (Cutlass or non-flashinfer) assert (layer.w13_weight_scale.shape[2] % 16 == 0), ( @@ -1333,13 +1339,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False) - if self.use_marlin: - prepare_moe_fp4_layer_for_marlin(layer) - del layer.g1_alphas - del layer.g2_alphas - del layer.w13_input_scale_quant - del layer.w2_input_scale_quant - def apply( self, layer: torch.nn.Module, From 321938e9ac4000e0cb37e328359a7fd3026bc672 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 27 Aug 2025 17:52:24 -0400 Subject: [PATCH 107/112] [Feature] Add `VLLM_DISABLE_PAD_FOR_CUDAGRAPH` to Avoid Hang Issue (#23595) Signed-off-by: yewentao256 <zhyanwentao@126.com> Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/envs.py | 7 +++++++ vllm/v1/worker/gpu_model_runner.py | 1 + 2 files changed, 8 insertions(+) diff --git a/vllm/envs.py b/vllm/envs.py index 35735b552575b..a6a795dcfcda9 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -166,6 +166,7 @@ if 
TYPE_CHECKING: VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None + VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False def get_default_cache_root(): @@ -1144,6 +1145,12 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_ENABLE_CUDAGRAPH_GC": lambda: bool(int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))), + # Disable padding to CUDA graph capture batch sizes. + # TODO(wentao): https://github.com/vllm-project/vllm/issues/23378 + # After the issue is fixed, we can remove this flag. + "VLLM_DISABLE_PAD_FOR_CUDAGRAPH": + lambda: bool(int(os.getenv("VLLM_DISABLE_PAD_FOR_CUDAGRAPH", "0"))), + # Used to force set up loopback IP "VLLM_LOOPBACK_IP": lambda: os.getenv("VLLM_LOOPBACK_IP", ""), diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 01c90b2ea38d3..a194808e513dd 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1491,6 +1491,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and not envs.VLLM_DISABLE_PAD_FOR_CUDAGRAPH and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): # Use CUDA graphs. # Add padding to the batch size. 
From 5da4f5d857933329aaca779e3a81f1385c84e34a Mon Sep 17 00:00:00 2001 From: Hanchenli <61769611+Hanchenli@users.noreply.github.com> Date: Wed, 27 Aug 2025 17:44:52 -0700 Subject: [PATCH 108/112] [Bugfix] Fix for V1 priority scheduling crashes at preemption (#23713) Signed-off-by: Hanchenli <lihanc2002@gmail.com> --- tests/v1/core/test_scheduler.py | 91 +++++++++++++++++++++++++++++++-- vllm/v1/core/sched/scheduler.py | 2 + 2 files changed, 90 insertions(+), 3 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 07d7c12a4f5ef..70e8691788045 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1293,7 +1293,8 @@ def create_requests_with_priority( mm_positions: Optional[list[list[PlaceholderRange]]] = None, max_tokens: int = 16, stop_token_ids: Optional[list[int]] = None, - prompt_logprobs: Optional[int] = None): + prompt_logprobs: Optional[int] = None, + starting_idx: int = 0): """Create requests with specified priorities and arrival times.""" assert len(priorities) == num_requests if arrival_times is not None: @@ -1315,8 +1316,8 @@ def create_requests_with_priority( mm_position = None mm_kwargs = None request = Request( - request_id=f"{i}", - prompt_token_ids=[i] * num_tokens, + request_id=f"{i + starting_idx}", + prompt_token_ids=[i + starting_idx] * num_tokens, sampling_params=sampling_params, pooling_params=None, multi_modal_kwargs=mm_kwargs, @@ -1813,3 +1814,87 @@ def test_schedule_skip_tokenizer_init_structured_output_request(): assert len(output.scheduled_new_reqs) == 0 assert len(scheduler.running) == 0 assert len(scheduler.waiting) == 1 + + +def test_priority_scheduling_preemption_when_out_of_kv(): + """Test that priority scheduling preempts lower priority requests + when out of KV cache space.""" + # Create scheduler with very limited memory to force preemption + scheduler = create_scheduler_with_priority( + max_num_seqs=2, # Allow multiple requests + max_num_batched_tokens=200, 
+ num_blocks=5, # Can hold 64 tokens (first block is null) + block_size=16, # Standard block size + ) + + # Create a request and schedule it + request_low = create_requests_with_priority( + num_requests=1, + priorities=[1], + arrival_times=[0.0], + num_tokens=30, + starting_idx=0, + )[0] + scheduler.add_request(request_low) + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 1 + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == 1 + + # Simulate model execution + model_output = ModelRunnerOutput( + req_ids=[request_low.request_id], + req_id_to_index={request_low.request_id: 0}, + sampled_token_ids=[[100]], + # spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + ) + scheduler.update_from_output(output, model_output) + + # Create a high priority request and schedule it + request_high = create_requests_with_priority( + num_requests=1, + priorities=[0], + arrival_times=[1.0], + num_tokens=32, + starting_idx=1, + )[0] + scheduler.add_request(request_high) + output = scheduler.schedule() + # KV cache should be full at this point + assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == 0 + assert len(output.scheduled_new_reqs) == 1 + assert output.scheduled_cached_reqs.num_reqs == 1 + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == 2 + + # Simulate model execution + requests = [request_low, request_high] + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[100] for _ in requests], + # spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + ) + scheduler.update_from_output(output, model_output) + + # Schedule again - this should trigger preemption + # req_low needs 32 tokens = 2 blocks + # req_high needs 33 tokens = 3 blocks + # so doesn't fit in 4 blocks. 
+ output = scheduler.schedule() + + # Should have preempted req_low + assert len(output.scheduled_new_reqs) == 0 + assert output.scheduled_cached_reqs.num_reqs == 1 + assert output.scheduled_cached_reqs.req_ids[0] == request_high.request_id + assert len(scheduler.waiting) == 1 + assert len(scheduler.running) == 1 \ No newline at end of file diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 14a914d8f2f0b..3bd2fe2f0515f 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -253,6 +253,8 @@ class Scheduler(SchedulerInterface): key=lambda r: (r.priority, r.arrival_time), ) self.running.remove(preempted_req) + if preempted_req in scheduled_running_reqs: + scheduled_running_reqs.remove(preempted_req) else: preempted_req = self.running.pop() From a69693e38f27f12e5a5d05b6792e590b520ca27b Mon Sep 17 00:00:00 2001 From: Benji Beck <benjibeck@meta.com> Date: Wed, 27 Aug 2025 19:43:26 -0700 Subject: [PATCH 109/112] Migrate Qwen inputs to TensorSchema (#23473) Signed-off-by: Benji Beck <benjibeck@meta.com> --- vllm/model_executor/models/qwen_vl.py | 51 +++++++++++++-------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 2950ca664a98f..90200f319464b 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -11,7 +11,7 @@ import math import unicodedata from collections.abc import Collection, Mapping, Sequence, Set from functools import lru_cache, partial -from typing import Callable, Literal, Optional, TypedDict, Union +from typing import Annotated, Callable, Literal, Optional, Union import regex as re import torch @@ -40,6 +40,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema 
import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) @@ -47,26 +48,34 @@ from .qwen import QWenBaseModel, QWenModel from .utils import flatten_bn, merge_multimodal_embeddings -class QwenImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - data: torch.Tensor +class QwenImagePixelInputs(TensorSchema): """ - Shape: `(batch_size * num_images, 3, image_size, image_size)` - + Dimensions: + - bn: Batch size * number of images + - c: Number of channels (3) + - h: Height + - w: Width + Note that image_size is the value in the vision config to which we resize the image to in the normalization transform. Currently multi-image support can only be leveraged by passing image embeddings directly. """ + type: Literal["pixel_values"] = "pixel_values" + data: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")] -class QwenImageEmbeddingInputs(TypedDict): - type: Literal["image_embeds"] - data: torch.Tensor - """Shape: `(batch_size * num_images, 256, hidden_size)` - +class QwenImageEmbeddingInputs(TensorSchema): + """ + Dimensions: + - bn: Batch size * number of images + - ifs: Image feature size (256) + - hs: Hidden size + `hidden_size` must match the hidden size of the language model backbone and is stored in the visual config of the model if we have one. 
""" + type: Literal["image_embeds"] = "image_embeds" + data: Annotated[torch.Tensor, TensorShape("bn", 256, "hs")] QwenImageInputs = Union[QwenImagePixelInputs, QwenImageEmbeddingInputs] @@ -697,19 +706,6 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA, self.transformer: QwenVLModel - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: - h = w = self.config.visual["image_size"] - expected_dims = (3, h, w) - actual_dims = tuple(data.shape[1:]) - - if actual_dims != expected_dims: - expected_expr = ("batch_size", *map(str, expected_dims)) - raise ValueError( - f"The expected shape of pixel values is {expected_expr}. " - f"You supplied {tuple(data.shape)}.") - - return data - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[QwenImageInputs]: pixel_values = kwargs.pop("pixel_values", None) @@ -720,10 +716,13 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA, raise ValueError("Incorrect type of pixel values. " f"Got type: {type(pixel_values)}") + expected_h = expected_w = self.config.visual["image_size"] + resolve_bindings = {"h": expected_h, "w": expected_w} + return QwenImagePixelInputs( type="pixel_values", - data=self._validate_pixel_values( - flatten_bn(pixel_values, concat=True)), + data=flatten_bn(pixel_values, concat=True), + resolve_bindings=resolve_bindings, ) if image_embeds is not None: From 1b7b161a09289214eea41e17895a68a7ccd4b1dc Mon Sep 17 00:00:00 2001 From: Shrey Gupta <66182248+Shrey1306@users.noreply.github.com> Date: Thu, 28 Aug 2025 08:42:44 +0530 Subject: [PATCH 110/112] [Feature] models: pass layer prefix to replace_linear_class for per-layer quantization routing. 
Addresses #23239 (#23556) Signed-off-by: Shrey Gupta <shreyg1303@gmail.com> --- vllm/model_executor/models/deepseek_vl2.py | 12 ++++++++---- vllm/model_executor/models/transformers.py | 14 ++++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 1bd2802a86838..5eab02b17151c 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -408,13 +408,17 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): if isinstance(module, nn.Linear): parent, attr_name = self._get_parent_and_attr(vit, name) if isinstance(parent, timm.layers.Mlp) and attr_name == "fc1": - new_linear = replace_linear_class(module, "colwise", - quant_config) + new_linear = replace_linear_class(module, + "colwise", + quant_config, + prefix=name) setattr(parent, attr_name, new_linear) elif isinstance(parent, timm.layers.Mlp) and attr_name == "fc2": - new_linear = replace_linear_class(module, "rowwise", - quant_config) + new_linear = replace_linear_class(module, + "rowwise", + quant_config, + prefix=name) setattr(parent, attr_name, new_linear) return vit diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index edf3dddb1bad2..f7ced6134da52 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -106,8 +106,11 @@ def can_enable_torch_compile(vllm_config: VllmConfig) -> bool: def replace_linear_class( - linear: nn.Linear, style: Literal["colwise", "rowwise"], - quant_config: QuantizationConfig + linear: nn.Linear, + style: Literal["colwise", "rowwise"], + quant_config: QuantizationConfig, + *, + prefix: str = "", ) -> Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]: """ Replace nn.Linear with one of vLLM's tensor parallel linear classes. 
@@ -141,6 +144,7 @@ def replace_linear_class( output_size=linear.out_features, bias=linear.bias is not None, quant_config=quant_config, + prefix=prefix, return_bias=False, **vllm_linear_kwargs, ) @@ -557,8 +561,10 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): generator = (p for p in tp_plan if re.match(p, qual_name)) pattern = next(generator, None) style = tp_plan.get(pattern, "replicate") - new_module = replace_linear_class(child_module, style, - self.quant_config) + new_module = replace_linear_class(child_module, + style, + self.quant_config, + prefix=qual_name) setattr(module, child_name, new_module) log_replacement(qual_name, child_module, new_module) else: From a781e84ec25b1d1b6c245f2e8ffec6e10bafdaa1 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Wed, 27 Aug 2025 23:12:53 -0400 Subject: [PATCH 111/112] [Perf] Tune configs for triton block fp8 gemm H100/H200 (#23748) Signed-off-by: mgoin <mgoin64@gmail.com> --- benchmarks/kernels/bench_block_fp8_gemm.py | 113 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 90 +++++------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 82 +++++----- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 62 ++++---- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 54 +++---- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 82 +++++----- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 84 +++++----- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 118 +++++++------- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 132 ++++++++-------- 
...,dtype=fp8_w8a8,block_shape=[128,128].json | 82 +++++----- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 76 ++++----- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 60 +++---- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 100 ++++++------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 108 ++++++------- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 76 ++++----- 21 files changed, 1592 insertions(+), 603 deletions(-) create mode 100644 benchmarks/kernels/bench_block_fp8_gemm.py create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py new file mode 100644 index 0000000000000..883f0cf7e55f1 --- /dev/null +++ b/benchmarks/kernels/bench_block_fp8_gemm.py @@ -0,0 +1,113 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + w8a8_block_fp8_matmul, +) +from vllm.platforms import current_platform +from vllm.triton_utils import triton as 
vllm_triton + +assert current_platform.is_cuda(), ( + "Only support benchmarking w8a8 block fp8 kernel on CUDA device." +) + +# DeepSeek-V3 weight shapes +DEEPSEEK_V3_SHAPES = [ + (512 + 64, 7168), + ((128 + 64) * 128, 7168), + (128 * (128 + 128), 512), + (7168, 16384), + (7168, 18432), + (18432 * 2, 7168), + (24576, 1536), + (12288, 7168), + (4096, 7168), + (7168, 2048), +] + + +def build_w8a8_block_fp8_runner(M, N, K, block_size, device): + """Build runner function for w8a8 block fp8 matmul.""" + factor_for_scale = 1e-2 + + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + # Create random FP8 tensors + A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max + A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max + B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + # Create scales + block_n, block_k = block_size[0], block_size[1] + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + + As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale + Bs = ( + torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device) + * factor_for_scale + ) + + def run(): + return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16) + + return run + + +@vllm_triton.testing.perf_report( + vllm_triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=["torch-bf16", "w8a8-block-fp8"], + line_names=["torch-bf16", "w8a8-block-fp8"], + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs W8A8 Block FP8 GEMMs", + args={}, + ) +) +def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)): + M = batch_size + device = "cuda" + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + a = 
torch.randn((M, K), device=device, dtype=torch.bfloat16) + b = torch.randn((N, K), device=device, dtype=torch.bfloat16) + ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + else: # w8a8-block-fp8 + run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device) + ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( + lambda: run_w8a8(), quantiles=quantiles + ) + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +if __name__ == "__main__": + block_size = (128, 128) + + for N, K in DEEPSEEK_V3_SHAPES: + print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}") + + print(f"TFLOP/s comparison (block_size={block_size}):") + benchmark_tflops.run( + print_data=True, + # show_plots=False, + # save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}", + N=N, + K=K, + block_size=block_size, + ) + + print("\nBenchmark finished!") diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..0ea0225c96af1 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..be487f2805b85 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 
3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..f74a52fc17c9d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, 
+ "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..8cab1b093276a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + 
"num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json index 1c61451fb34e5..ae244f90bb064 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,73 +1,73 @@ { "1": { - 
"BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4 }, "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, - "24": { - "BLOCK_SIZE_M": 64, + "4": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 }, "32": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "48": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -75,7 +75,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, @@ -83,7 
+83,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -91,7 +91,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -107,7 +107,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -115,15 +115,15 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "2048": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, @@ -133,13 +133,13 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4096": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 } diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index 63e661c80de6a..b2931d68f488a 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,83 +1,83 @@ { "1": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4 }, "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 8, - 
"num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, - "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "16": { - "BLOCK_SIZE_M": 64, + "4": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 }, "24": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "32": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "48": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "128": { "BLOCK_SIZE_M": 64, @@ -91,7 +91,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -99,9 +99,9 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 
+ "num_stages": 4 }, "1024": { "BLOCK_SIZE_M": 64, @@ -115,7 +115,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -139,8 +139,8 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 } -} +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json index 56b939e52fac3..ad630f0d787cf 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,30 +1,30 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3 }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, @@ -32,19 +32,19 @@ "num_stages": 3 }, "16": { - "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 2 }, "24": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, - "num_warps": 4, + "num_warps": 8, "num_stages": 3 }, "32": { @@ -59,9 +59,9 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "64": { "BLOCK_SIZE_M": 64, @@ -75,7 +75,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -83,7 +83,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -91,7 +91,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, @@ -123,7 +123,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3 }, @@ -139,7 +139,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 } diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index 63d9a0bf5d79d..10b940c04fad3 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,50 +1,50 @@ { "1": { - "BLOCK_SIZE_M": 
64, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4 }, "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3 }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 2 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 5 }, "24": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, + "GROUP_SIZE_M": 1, + "num_warps": 8, "num_stages": 3 }, "32": { @@ -59,15 +59,15 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 2 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -75,7 +75,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3 }, @@ -91,7 +91,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -139,7 +139,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 } diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json index 7fa398c15a2a5..94ce6e77f09ce 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,55 +1,55 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 5 - }, - "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 5 - }, - "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4, + "GROUP_SIZE_M": 1, + "num_warps": 8, "num_stages": 3 }, - "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 5 + "num_stages": 3 }, - "16": { - "BLOCK_SIZE_M": 64, + "4": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, - "24": { - "BLOCK_SIZE_M": 64, + "8": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, "32": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, @@ -59,31 +59,31 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -99,7 +99,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -107,7 +107,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -123,7 +123,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -131,7 +131,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index f15d8f64c7090..9540df407975e 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ 
b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,57 +1,57 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3 }, "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 5 - }, - "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 5 + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 5 + "num_stages": 3 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3 }, "24": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3 }, "32": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -59,33 +59,33 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "128": { 
- "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 }, "256": { "BLOCK_SIZE_M": 64, @@ -93,23 +93,23 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "512": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1024": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1536": { "BLOCK_SIZE_M": 64, @@ -133,7 +133,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4096": { "BLOCK_SIZE_M": 64, diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..96f6c307b357d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..567675787d4f9 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json index 51e237b91b8e7..0894ff2fa3322 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,6 +1,6 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, @@ -8,55 +8,55 @@ "num_stages": 5 }, "2": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 }, - "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 4 - }, - "16": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 4 - }, "24": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "32": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4 }, "48": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, @@ -64,83 +64,83 @@ "num_stages": 4 }, "64": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "96": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5 }, "128": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4 }, "256": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + 
"GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "512": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "1024": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 4 - }, - "1536": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "2048": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, - "3072": { + "1536": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, "num_stages": 3 }, "4096": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 } } diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index 6280219c9ee7d..86c68e08a1a6a 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,78 +1,78 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - 
"GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4 }, "2": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "24": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4 }, "32": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "48": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "64": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 }, "96": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, @@ -80,38 +80,14 @@ "num_stages": 5 }, "128": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 4 
- }, - "256": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, - "512": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 4 - }, - "1024": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 4 - }, - "1536": { + "256": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, @@ -119,19 +95,43 @@ "num_warps": 4, "num_stages": 5 }, - "2048": { + "512": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, "3072": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3 }, @@ -139,7 +139,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 } diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json index 0a1e14cffbb2a..af1a384cbcbd3 100644 --- 
a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,14 +1,14 @@ { "1": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5 }, "2": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, @@ -16,26 +16,26 @@ "num_stages": 5 }, "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5 }, "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5 }, @@ -43,9 +43,9 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "32": { "BLOCK_SIZE_M": 64, @@ -59,7 +59,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4 }, @@ -67,31 +67,31 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "256": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -101,25 +101,9 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1024": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 3 - }, - "1536": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 3 - }, - "2048": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, @@ -127,13 +111,29 @@ "num_warps": 4, "num_stages": 3 }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, "3072": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4096": { "BLOCK_SIZE_M": 64, @@ -141,6 +141,6 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 } } diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index 15b1c93f60fc5..d381764a26414 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,22 
+1,22 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5 }, - "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 5 - }, "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, @@ -24,18 +24,18 @@ "num_stages": 5 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "16": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5 }, @@ -45,47 +45,47 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "32": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "48": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "256": { "BLOCK_SIZE_M": 64, @@ -93,29 
+93,29 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "512": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1024": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1536": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -123,7 +123,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, @@ -133,7 +133,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4096": { "BLOCK_SIZE_M": 64, diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json index 8ff12e64c172f..821ad0c704573 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,43 +1,43 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, + "GROUP_SIZE_M": 16, + "num_warps": 8, "num_stages": 5 }, "2": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5 }, "4": { - 
"BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5 }, "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5 }, "16": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "24": { "BLOCK_SIZE_M": 64, @@ -45,7 +45,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "32": { "BLOCK_SIZE_M": 64, @@ -59,7 +59,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5 }, @@ -73,19 +73,19 @@ }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "256": { "BLOCK_SIZE_M": 64, @@ -99,21 +99,21 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, "1024": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, "1536": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, @@ -123,9 +123,9 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + 
"num_stages": 4 }, "3072": { "BLOCK_SIZE_M": 64, @@ -133,7 +133,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4096": { "BLOCK_SIZE_M": 64, diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index 4532f93681e2b..daaf21c286553 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,67 +1,67 @@ { "1": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5 }, - "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 5 - }, - "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 5 - }, "8": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 4 - }, - "16": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 }, "24": 
{ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "32": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "48": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "64": { "BLOCK_SIZE_M": 64, @@ -73,25 +73,25 @@ }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "256": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -99,31 +99,31 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1024": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1536": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "2048": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -133,7 +133,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4096": { "BLOCK_SIZE_M": 64, @@ -141,6 +141,6 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + 
"num_stages": 4 } } diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json index ca7f32b9552b4..2583b5a3441ca 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,57 +1,57 @@ { "1": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 5 - }, - "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 5 - }, - "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 5 + "num_warps": 8, + "num_stages": 3 }, - "8": { - "BLOCK_SIZE_M": 64, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 5 + "num_stages": 3 }, - "16": { - "BLOCK_SIZE_M": 64, + "8": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4 }, "24": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 }, "32": { - 
"BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 }, @@ -59,43 +59,35 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 }, "128": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "256": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, - "512": { + "256": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, @@ -103,19 +95,27 @@ "num_warps": 4, "num_stages": 3 }, - "1024": { + "512": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, "1536": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, @@ -123,7 +123,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -131,7 +131,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, @@ -139,8 +139,8 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - 
"GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 } } diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index 5acea242cc0ad..baa64f8d3d141 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,65 +1,65 @@ { "1": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 8, - "num_stages": 4 + "num_stages": 5 }, "2": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 5 + "num_stages": 3 }, "4": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 5 + "num_stages": 3 }, - "8": { - "BLOCK_SIZE_M": 64, + "16": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 - }, - "16": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "24": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "32": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + 
"GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "48": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4 }, @@ -69,21 +69,21 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4 }, @@ -91,7 +91,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -99,13 +99,13 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, "1024": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, @@ -115,7 +115,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, @@ -123,7 +123,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -131,15 +131,15 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4096": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 } From a11adafdcab61c059d2a76d952367a722e1b71d5 Mon Sep 17 00:00:00 2001 From: Jan Kessler <Ithanil@users.noreply.github.com> Date: Thu, 28 Aug 2025 05:14:00 +0200 Subject: [PATCH 112/112] 
Gracefully handle edge cases in harmony utils (#23155) Signed-off-by: Jan Kessler <jakessle@uni-mainz.de> Co-authored-by: Chen Zhang <zhangch99@outlook.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> --- vllm/entrypoints/harmony_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index bc810f683f4a4..078d316844257 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -155,7 +155,7 @@ def parse_chat_input(chat_msg) -> Message: contents = [TextContent(text=content)] else: # TODO: Support refusal. - contents = [TextContent(text=c["text"]) for c in content] + contents = [TextContent(text=c.get("text", "")) for c in content] msg = Message.from_role_and_contents(role, contents) return msg @@ -218,8 +218,8 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]: ) output_items.append(reasoning_item) elif message.channel == "commentary": - if message.recipient.startswith("functions."): - function_name = message.recipient.split(".")[-1] + if recipient is not None and recipient.startswith("functions."): + function_name = recipient.split(".")[-1] for content in message.content: random_id = random_uuid() response_item = ResponseFunctionToolCall( @@ -230,8 +230,8 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]: id=f"ft_{random_id}", ) output_items.append(response_item) - elif message.recipient.startswith( - "python") or message.recipient.startswith("browser"): + elif recipient is not None and (recipient.startswith("python") + or recipient.startswith("browser")): for content in message.content: reasoning_item = ResponseReasoningItem( id=f"rs_{random_uuid()}", @@ -245,7 +245,7 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]: ) output_items.append(reasoning_item) else: - raise ValueError(f"Unknown recipient: {message.recipient}") + raise ValueError(f"Unknown 
recipient: {recipient}") elif message.channel == "final": contents = [] for content in message.content: