From f32a5bc5058afc2fb601dcb456b581e2fefa94dd Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Thu, 28 Aug 2025 10:29:37 -0700 Subject: [PATCH 01/56] Migrate Llama4ImagePatchInputs to TensorSchema (#22021) Signed-off-by: Benji Beck --- vllm/model_executor/models/mllama4.py | 41 +++++++++++++++------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index ac9b968f7a0cd..ecbbb5f57bec8 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -19,7 +19,7 @@ import math from collections.abc import Iterable, Mapping from itertools import tee -from typing import Literal, Optional, TypedDict, Union +from typing import Annotated, Literal, Optional, Union import torch from torch import nn @@ -53,6 +53,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.multimodal.utils import run_dp_sharded_vision_model from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .llama4 import Llama4ForCausalLM @@ -60,28 +61,34 @@ from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, merge_multimodal_embeddings) -class Llama4ImagePatchInputs(TypedDict): - type: Literal["pixel_values"] - flat_data: torch.Tensor +class Llama4ImagePatchInputs(TensorSchema): """ - Shape: - `(batch_size * num_chunks, num_channels, image size, image size)` + Dimensions: + - batch_size: Batch size + - total_num_chunks: Batch size * number of chunks + - num_channels: Number of channels + - image_size: Size of each image """ - patches_per_image: torch.Tensor + + type: Literal["pixel_values"] = "pixel_values" + + flat_data: Annotated[torch.Tensor, + TensorShape("total_num_chunks", "num_channels", + "image_size", "image_size")] + + patches_per_image: 
Annotated[torch.Tensor, TensorShape("batch_size")] """ The number of total patches for each image in the batch. - + This is used to split the embeddings which has the first two dimensions flattened just like `flat_data`. """ - aspect_ratios: Union[torch.Tensor, list[torch.Tensor]] + aspect_ratios: Annotated[torch.Tensor, TensorShape("batch_size", 2)] """ A list of aspect ratios corresponding to the number of tiles in each dimension that each image in the batch corresponds to. - - Shape: - `(batch_size, ratio)` where ratio is a pair `(ratio_h, ratio_w)` + Each aspect ratio is a pair (ratio_h, ratio_w). """ @@ -623,7 +630,7 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo] for (r_h, r_w) in aspect_ratios ] - processed_outputs["aspect_ratios"] = aspect_ratios + processed_outputs["aspect_ratios"] = torch.tensor(aspect_ratios) processed_outputs["patches_per_image"] = torch.tensor( patches_per_image) @@ -770,11 +777,9 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, # TODO: confirm handling for variable lengths flat_pixel_values = flatten_bn(pixel_values, concat=True) patches_per_image = flatten_bn(kwargs.pop("patches_per_image")) - - aspect_ratios = kwargs.pop("aspect_ratios", None) - if not isinstance(aspect_ratios, (torch.Tensor, list)): - raise ValueError("Incorrect type of aspect_ratios. 
" - f"Got type: {type(aspect_ratios)}") + aspect_ratios = kwargs.pop("aspect_ratios") + if aspect_ratios.ndim == 3: + aspect_ratios = aspect_ratios.squeeze(1) return Llama4ImagePatchInputs( type="pixel_values", From 04d1dd7f4a444a61ae4b01ea0271490082dbd605 Mon Sep 17 00:00:00 2001 From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Date: Thu, 28 Aug 2025 13:18:08 -0500 Subject: [PATCH 02/56] [ROCm][Aiter] Add triton fp8 bmm kernel for mla (#23264) Signed-off-by: Divakar Verma Co-authored-by: ShaoChunLee --- vllm/envs.py | 8 ++ vllm/v1/attention/backends/mla/common.py | 108 ++++++++++++++++++++--- 2 files changed, 104 insertions(+), 12 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index a6a795dcfcda9..1232bd7bf9635 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -99,6 +99,7 @@ if TYPE_CHECKING: VLLM_ROCM_USE_AITER_RMSNORM: bool = True VLLM_ROCM_USE_AITER_MLA: bool = True VLLM_ROCM_USE_AITER_MHA: bool = True + VLLM_ROCM_USE_AITER_FP8BMM: bool = True VLLM_ROCM_USE_SKINNY_GEMM: bool = True VLLM_ROCM_FP8_PADDING: bool = True VLLM_ROCM_MOE_PADDING: bool = True @@ -774,6 +775,12 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: (os.getenv("VLLM_ROCM_USE_AITER_MHA", "True").lower() in ("true", "1")), + # Whether to use aiter triton fp8 bmm kernel + # By default is enabled. 
+ "VLLM_ROCM_USE_AITER_FP8BMM": + lambda: (os.getenv("VLLM_ROCM_USE_AITER_FP8BMM", "True").lower() in + ("true", "1")), + # use rocm skinny gemms "VLLM_ROCM_USE_SKINNY_GEMM": lambda: (os.getenv("VLLM_ROCM_USE_SKINNY_GEMM", "True").lower() in @@ -1272,6 +1279,7 @@ def compute_hash() -> str: "VLLM_ROCM_USE_AITER_RMSNORM", "VLLM_ROCM_USE_AITER_MLA", "VLLM_ROCM_USE_AITER_MHA", + "VLLM_ROCM_USE_AITER_FP8BMM", "VLLM_ROCM_USE_SKINNY_GEMM", "VLLM_ROCM_FP8_PADDING", "VLLM_ROCM_MOE_PADDING", diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index ce45b34f64355..9f93b50b075b4 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -193,6 +193,7 @@ from dataclasses import dataclass, field from typing import ClassVar, Generic, Optional, TypeVar, Union import torch +from tqdm import tqdm import vllm.envs as envs from vllm import _custom_ops as ops @@ -203,6 +204,7 @@ from vllm.attention.backends.utils import get_mla_dims from vllm.attention.ops.merge_attn_states import merge_attn_states from vllm.attention.utils.fa_utils import get_flash_attn_version from vllm.config import VllmConfig +from vllm.distributed.parallel_state import is_global_first_rank from vllm.logger import init_logger from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearBase, @@ -234,6 +236,28 @@ try: except ImportError: flashinfer_available = False + +def is_rocm_aiter_fp8bmm_enabled() -> bool: + return current_platform.is_rocm() \ + and envs.VLLM_ROCM_USE_AITER_FP8BMM \ + and envs.VLLM_ROCM_USE_AITER + + +if is_rocm_aiter_fp8bmm_enabled(): + from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501 # isort: skip + batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant + as aiter_triton_fp8_bmm) + + def dynamic_per_batched_tensor_quant( + x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn): + DTYPE_MAX = 
torch.finfo(dtype).max + min_val, max_val = x.aminmax() + amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-10) + scale = DTYPE_MAX / amax + x_scl_sat = (x * scale).clamp(min=-DTYPE_MAX, max=DTYPE_MAX) + return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal() + + logger = init_logger(__name__) CUDNN_WORKSPACE_SIZE = 12800 @@ -945,10 +969,21 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): def _v_up_proj(self, x): # Convert from (B, N, L) to (N, B, L) x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) - # Multiply (N, B, L) x (N, L, V) -> (N, B, V) - x = torch.bmm(x, self.W_UV) - # Convert from (N, B, V) to (B, N * V) - return x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) + if is_rocm_aiter_fp8bmm_enabled(): + # Multiply + Transpose (N, B, L) x (N, L, V)->(N, B, V)->(B, N, V) + x = aiter_triton_fp8_bmm(x, + self.W_V, + self.W_V_scale, + group_size=128, + transpose_bm=True) + # Convert from (B, N, V) to (B, N * V) + x = x.reshape(-1, self.num_heads * self.v_head_dim) + else: + # Multiply (N, B, L) x (N, L, V) -> (N, B, V) + x = torch.bmm(x, self.W_UV) + # Convert from (N, B, V) to (B, N * V) + x = x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) + return x def process_weights_after_loading(self, act_dtype: torch.dtype): @@ -996,10 +1031,50 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): W_UK, W_UV = kv_b_proj_weight.split( [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - # Convert from (L, N, V) to (N, L, V) - self.W_UV = W_UV.transpose(0, 1) - # Convert from (L, N, P) to (N, P, L) - self.W_UK_T = W_UK.permute(1, 2, 0) + if is_rocm_aiter_fp8bmm_enabled(): + W_K = W_UK.transpose(0, 1) # 16 512 128 + W_V = W_UV.permute(1, 2, 0) # 16 128 512 + self.W_K, self.W_K_scale = dynamic_per_batched_tensor_quant( + W_K, dtype=current_platform.fp8_dtype()) + self.W_V, self.W_V_scale = dynamic_per_batched_tensor_quant( + W_V, dtype=current_platform.fp8_dtype()) + + # The kernel operates on 
non-padded inputs. Hence, pre-compiling + # triton kernel to avoid runtime compilation for unseen batch sizes + # Pre-compile for batch sizes 1 to 1024 to cover most use-cases. + # On DS-R1, this step adds roughly 50s to the model loading time. + max_batch_size = 1024 # [ToDo] Find the optimal upper limit + pre_compilation_list = list(range(1, max_batch_size + 1)) + if is_global_first_rank(): + pre_compilation_list = tqdm( + pre_compilation_list, + desc="[Aiter Triton] Pre-compiling fp8 BMM kernel", + total=max_batch_size, + ) + + for m in pre_compilation_list: + x = torch.empty((self.W_K.shape[0], m, self.W_K.shape[2]), + dtype=torch.bfloat16, + device=self.W_K.device) + aiter_triton_fp8_bmm(x, + self.W_K, + self.W_K_scale, + group_size=128, + transpose_bm=True) + + x = torch.empty((self.W_V.shape[0], m, self.W_V.shape[2]), + dtype=torch.bfloat16, + device=self.W_V.device) + aiter_triton_fp8_bmm(x, + self.W_V, + self.W_V_scale, + group_size=128, + transpose_bm=True) + else: + # Convert from (L, N, V) to (N, L, V) + self.W_UV = W_UV.transpose(0, 1) + # Convert from (L, N, P) to (N, P, L) + self.W_UK_T = W_UK.permute(1, 2, 0) def _compute_prefill_context( self, @@ -1203,10 +1278,19 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) # Convert from (B, N, P) to (N, B, P) decode_q_nope = decode_q_nope.transpose(0, 1) - # Multiply (N, B, P) x (N, P, L) -> (N, B, L) - decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T) - # Convert from (N, B, L) to (B, N, L) - decode_ql_nope = decode_ql_nope.transpose(0, 1) + + if is_rocm_aiter_fp8bmm_enabled(): + # Multiply+Transpose (N, B, P)x(N, P, L)->(N, B, L)->(B, N, L) + decode_ql_nope = aiter_triton_fp8_bmm(decode_q_nope, + self.W_K, + self.W_K_scale, + group_size=128, + transpose_bm=True) + else: + # Multiply (N, B, P) x (N, P, L) -> (N, B, L) + decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T) + # Convert from (N, B, L) to (B, N, L) + decode_ql_nope = 
decode_ql_nope.transpose(0, 1) if fp8_attention: ql_nope_shape = decode_ql_nope.shape From 57d4ede520b6071341ebd310c0ddd4c6f4d54917 Mon Sep 17 00:00:00 2001 From: Jingkai He Date: Fri, 29 Aug 2025 03:05:20 +0800 Subject: [PATCH 03/56] [bugfix] [spec-decoding] fix data race in sample_recovered_tokens_kernel (vLLM v1) (#23829) Signed-off-by: He-Jingkai --- vllm/v1/sample/rejection_sampler.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 511cdb3234253..3d5e59addfcfa 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -598,17 +598,10 @@ def sample_recovered_tokens_kernel( vocab_offset = tl.arange(0, PADDED_VOCAB_SIZE) if NO_DRAFT_PROBS: draft_token_id = tl.load(draft_token_ids_ptr + start_idx + pos) - orig_prob = tl.load(target_probs_ptr + (start_idx + pos) * vocab_size + - draft_token_id) - # Temporarily zero out the probability of the draft token. - # This is essentially the same as target_prob - draft_prob, except that - # n-gram does not have draft_prob. We regard it as 1. - tl.store( - target_probs_ptr + (start_idx + pos) * vocab_size + draft_token_id, - 0) prob = tl.load(target_probs_ptr + (start_idx + pos) * vocab_size + vocab_offset, - mask=vocab_offset < vocab_size, + mask=((vocab_offset < vocab_size) & + (vocab_offset != draft_token_id)), other=0) else: draft_prob = tl.load(draft_probs_ptr + (start_idx + pos) * vocab_size + @@ -628,9 +621,3 @@ def sample_recovered_tokens_kernel( other=float("-inf")) recovered_id = tl.argmax(prob / q, axis=-1) tl.store(output_token_ids_ptr + start_idx + pos, recovered_id) - - if NO_DRAFT_PROBS: - # Restore the original probability. 
- tl.store( - target_probs_ptr + (start_idx + pos) * vocab_size + draft_token_id, - orig_prob) From 16a45b3a281805ea4d4ff3908cef512fdf6d9f84 Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Fri, 29 Aug 2025 03:36:50 +0800 Subject: [PATCH 04/56] [NVIDIA] Support SiluMul + NVFP4 quant fusion (#23671) Signed-off-by: jindih Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> Co-authored-by: jindih Co-authored-by: Michael Goin Co-authored-by: Luka Govedic --- .buildkite/test-pipeline.yaml | 2 + CMakeLists.txt | 2 + csrc/dispatch_utils.h | 16 + csrc/ops.h | 8 + .../activation_nvfp4_quant_fusion_kernels.cu | 368 ++++++++++++++++++ csrc/torch_bindings.cpp | 7 + tests/compile/test_silu_mul_quant_fusion.py | 97 +++-- .../test_silu_nvfp4_quant_fusion.py | 126 ++++++ vllm/compilation/activation_quant_fusion.py | 170 ++++++-- vllm/compilation/fix_functionalization.py | 7 + .../layers/quantization/modelopt.py | 7 +- 11 files changed, 746 insertions(+), 64 deletions(-) create mode 100644 csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu create mode 100644 tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 24cc57e9dfb97..454aaca0a1121 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -668,6 +668,7 @@ steps: # Quantization - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py @@ -677,6 +678,7 @@ steps: - pytest -v -s tests/compile/test_fusion_all_reduce.py - pytest -v -s 
tests/compile/test_fusion_attn.py::test_attention_quant_pattern - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py ##### 1 GPU test ##### ##### multi gpus test ##### diff --git a/CMakeLists.txt b/CMakeLists.txt index b0eb0f32e03a5..e92e08f0d0ecd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -541,6 +541,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) set(SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" + "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" @@ -559,6 +560,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) set(SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" + "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" "csrc/quantization/fp4/nvfp4_experts_quant.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu" "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu") diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index f7b75c48373f6..2728aa81f0c9f 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -19,6 +19,13 @@ #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) +#define VLLM_DISPATCH_CASE_HALF_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_HALF_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_HALF_TYPES(__VA_ARGS__)) + // ROCm devices might use either fn or fnuz, so set up dispatch table for both. // A host-based check at runtime will create a preferred FP8 type for ROCm // such that the correct kernel is dispatched. @@ -45,6 +52,15 @@ #define VLLM_DISPATCH_FP8_TYPES(TYPE, NAME, ...) 
\ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FP8_TYPES(__VA_ARGS__)) +#define AT_DISPATCH_BYTE_CASE(enum_type, ...) \ + AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, byte_t, __VA_ARGS__) + +#define VLLM_DISPATCH_CASE_BYTE_TYPES(...) \ + AT_DISPATCH_BYTE_CASE(at::ScalarType::Byte, __VA_ARGS__) + +#define VLLM_DISPATCH_BYTE_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_BYTE_TYPES(__VA_ARGS__)) + #define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__)) diff --git a/csrc/ops.h b/csrc/ops.h index 86fe848e2fd5a..78a487201bdd4 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -130,6 +130,14 @@ void silu_and_mul(torch::Tensor& out, torch::Tensor& input); void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& scale); +#ifndef USE_ROCM + +void silu_and_mul_nvfp4_quant(torch::Tensor& out, + torch::Tensor& output_block_scale, + torch::Tensor& input, + torch::Tensor& input_global_scale); +#endif + void mul_and_silu(torch::Tensor& out, torch::Tensor& input); void gelu_and_mul(torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu new file mode 100644 index 0000000000000..9bbeb0334fb9a --- /dev/null +++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include + +#include +#include "dispatch_utils.h" + +#include "cuda_utils.h" + +namespace vllm { + +// Get type2 from type or vice versa (applied to half and bfloat16) +template +struct TypeConverter { + using Type = half2; +}; // keep for generality + +template <> +struct TypeConverter { + using Type = c10::Half; +}; + +template <> +struct TypeConverter { + using Type = half2; +}; + +template <> +struct TypeConverter<__nv_bfloat162> { + using Type = c10::BFloat16; +}; + +template <> +struct TypeConverter { + using Type = __nv_bfloat162; +}; + +#define ELTS_PER_THREAD 8 + +constexpr int CVT_FP4_ELTS_PER_THREAD = 8; +constexpr int CVT_FP4_SF_VEC_SIZE = 16; + +// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t). +inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + uint32_t val; + asm volatile( + "{\n" + ".reg .b8 byte0;\n" + ".reg .b8 byte1;\n" + ".reg .b8 byte2;\n" + ".reg .b8 byte3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" + "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" + "}" + : "=r"(val) + : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]), + "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7])); + return val; +#else + return 0; +#endif +} + +// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t). 
+inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + uint32_t val; + asm volatile( + "{\n" + ".reg .b8 byte0;\n" + ".reg .b8 byte1;\n" + ".reg .b8 byte2;\n" + ".reg .b8 byte3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" + "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" + "}" + : "=r"(val) + : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y), + "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y)); + return val; +#else + return 0; +#endif +} + +// Fast reciprocal. +inline __device__ float reciprocal_approximate_ftz(float a) { + float b; + asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a)); + return b; +} + +template +__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, + int numCols, + SFType* SFout) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || + CVT_FP4_NUM_THREADS_PER_SF == 2); + + // One pair of threads write one SF to global memory. + // TODO: stage through smem for packed STG.32 + // is it better than STG.8 from 4 threads ? + if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) { + // SF vector index (16 elements share one SF in the K dimension). + int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; + int32_t mIdx = rowIdx; + + // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] + // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx] + + int32_t mTileIdx = mIdx / (32 * 4); + // SF vector size 16. + int factor = CVT_FP4_SF_VEC_SIZE * 4; + int32_t numKTiles = (numCols + factor - 1) / factor; + int64_t mTileStride = numKTiles * 32 * 4 * 4; + + int32_t kTileIdx = (kIdx / 4); + int64_t kTileStride = 32 * 4 * 4; + + // M tile layout [32, 4] is column-major. 
+ int32_t outerMIdx = (mIdx % 32); + int64_t outerMStride = 4 * 4; + + int32_t innerMIdx = (mIdx % (32 * 4)) / 32; + int64_t innerMStride = 4; + + int32_t innerKIdx = (kIdx % 4); + int64_t innerKStride = 1; + + // Compute the global offset. + int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + + outerMIdx * outerMStride + innerMIdx * innerMStride + + innerKIdx * innerKStride; + + return reinterpret_cast(SFout) + SFOffset; + } +#endif + return nullptr; +} + +// Define a 16 bytes packed data type. +template +struct PackedVec { + typename TypeConverter::Type elts[4]; +}; + +template <> +struct PackedVec<__nv_fp8_e4m3> { + __nv_fp8x2_e4m3 elts[8]; +}; + +template +__inline__ __device__ PackedVec compute_silu(PackedVec& vec, + PackedVec& vec2) { + PackedVec result; +#pragma unroll + for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) { + if constexpr (std::is_same_v) { + half2 val(0.5f, 0.5f); + half2 t0 = __hmul2(vec.elts[i], val); + half2 t1 = __hfma2(h2tanh(t0), val, val); + half2 t2 = __hmul2(vec.elts[i], t1); + result.elts[i] = __hmul2(t2, vec2.elts[i]); + } else { + __nv_bfloat162 val(0.5f, 0.5f); + __nv_bfloat162 t0 = __hmul2(vec.elts[i], val); + __nv_bfloat162 t1 = __hfma2(h2tanh(t0), val, val); + __nv_bfloat162 t2 = __hmul2(vec.elts[i], t1); + result.elts[i] = __hmul2(t2, vec2.elts[i]); + } + } + return result; +} + +// Quantizes the provided PackedVec into the uint32_t output +template +__device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec& vec, + PackedVec& vec2, + float SFScaleVal, + uint8_t* SFout) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + PackedVec out_silu = compute_silu(vec, vec2); + // Get absolute maximum values among the local 8 values. + auto localMax = __habs2(out_silu.elts[0]); + + // Local maximum value. + #pragma unroll + for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { + localMax = __hmax2(localMax, __habs2(out_silu.elts[i])); + } + + // Get the absolute maximum among all 16 values (two threads). 
+ localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax); + // Get the final absolute maximum values. + float vecMax = float(__hmax(localMax.x, localMax.y)); + + // Get the SF (max value of the vector / max value of e2m1). + // maximum value of e2m1 = 6.0. + // TODO: use half as compute data type. + float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f)); + // 8 bits representation of the SF. + uint8_t fp8SFVal; + // Write the SF to global memory (STG.8). + if constexpr (UE8M0_SF) { + // Extract the 8 exponent bits from float32. + // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits. + uint32_t tmp = reinterpret_cast(SFValue) >> 23; + fp8SFVal = tmp & 0xff; + // Convert back to fp32. + reinterpret_cast(SFValue) = tmp << 23; + } else { + // Here SFValue is always positive, so E4M3 is the same as UE4M3. + __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue); + reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp; + // Convert back to fp32. + SFValue = float(tmp); + } + // Get the output scale. + // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) * + // reciprocal(SFScaleVal)) + float outputScale = + SFValue != 0 ? reciprocal_approximate_ftz( + SFValue * reciprocal_approximate_ftz(SFScaleVal)) + : 0.0f; + + if (SFout) { + // Write the SF to global memory (STG.8). + *SFout = fp8SFVal; + } + + // Convert the input to float. + float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2]; + + #pragma unroll + for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { + if constexpr (std::is_same_v) { + fp2Vals[i] = __half22float2(out_silu.elts[i]); + } else { + fp2Vals[i] = __bfloat1622float2(out_silu.elts[i]); + } + fp2Vals[i].x *= outputScale; + fp2Vals[i].y *= outputScale; + } + + // Convert to e2m1 values. + uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals); + + // Write the e2m1 values to global memory. + return e2m1Vec; +#else + return 0; +#endif +} + +// Use UE4M3 by default. 
+template +__global__ void +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) +__launch_bounds__(1024, 4) silu_and_cvt_fp16_to_fp4( +#else +silu_and_cvt_fp16_to_fp4( +#endif + int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, + uint32_t* out, uint32_t* SFout) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + using PackedVec = PackedVec; + static constexpr int CVT_FP4_NUM_THREADS_PER_SF = + (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); + static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, + "Vec size is not matched."); + + // Get the global scaling factor, which will be applied to the SF. + // Note SFScale is the same as next GEMM's alpha, which is + // (448.f / (Alpha_A / 6.f)). + float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0]; + + // Input tensor row/col loops. + for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) { + for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD; + colIdx += blockDim.x) { + int64_t inOffset = + rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) + colIdx; + int64_t inOffset2 = rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) + + numCols / CVT_FP4_ELTS_PER_THREAD + colIdx; + PackedVec in_vec = reinterpret_cast(in)[inOffset]; + PackedVec in_vec2 = reinterpret_cast(in)[inOffset2]; + + // Get the output tensor offset. + // Same as inOffset because 8 elements are packed into one uint32_t. 
+ int64_t outOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx; + ; + auto& out_pos = out[outOffset]; + + auto sf_out = + cvt_quant_to_fp4_get_sf_out_offset( + rowIdx, colIdx, numCols, SFout); + + out_pos = silu_and_cvt_warp_fp16_to_fp4( + in_vec, in_vec2, SFScaleVal, sf_out); + } + } +#endif +} + +} // namespace vllm + +void silu_and_mul_nvfp4_quant(torch::Tensor& output, // [..., d] + torch::Tensor& output_sf, + torch::Tensor& input, // [..., 2 * d] + torch::Tensor& input_sf) { + TORCH_CHECK(input.dtype() == torch::kFloat16 || + input.dtype() == torch::kBFloat16); + int32_t m = input.size(0); + int32_t n = input.size(1) / 2; + TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16."); + int multiProcessorCount = + get_device_attribute(cudaDevAttrMultiProcessorCount, -1); + auto input_sf_ptr = static_cast(input_sf.data_ptr()); + auto sf_out = static_cast(output_sf.data_ptr()); + auto output_ptr = static_cast(output.data_ptr()); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); + dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024)); + int const numBlocksPerSM = 2048 / block.x; + dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); + VLLM_DISPATCH_HALF_TYPES( + input.scalar_type(), "act_and_mul_quant_kernel", [&] { + auto input_ptr = reinterpret_cast(input.data_ptr()); + VLLM_DISPATCH_BYTE_TYPES( + output.scalar_type(), "fused_act_and_mul_quant_kernel_nvfp4_type", + [&] { + vllm::silu_and_cvt_fp16_to_fp4 + <<>>( + m, n, input_ptr, input_sf_ptr, + reinterpret_cast(output_ptr), + reinterpret_cast(sf_out)); + }); + }); +} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 608b724403076..b769c09adc0f0 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -115,6 +115,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "silu_and_mul_quant(Tensor! 
result, Tensor input, Tensor scale) -> ()"); ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant); +#ifndef USE_ROCM + ops.def( + "silu_and_mul_nvfp4_quant(Tensor! result, Tensor! result_block_scale, " + "Tensor input, Tensor input_global_scale) -> ()"); + ops.impl("silu_and_mul_nvfp4_quant", torch::kCUDA, &silu_and_mul_nvfp4_quant); +#endif + ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()"); ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu); diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index 0e1059e654479..fcc2589e42116 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -4,32 +4,41 @@ import pytest import torch import vllm.envs as envs -from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass -from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe +from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant +# yapf conflicts with isort for this block +# yapf: disable +from vllm.compilation.activation_quant_fusion import ( + FUSED_OPS, SILU_MUL_OP, ActivationQuantFusionPass) +# yapf: enable +from vllm.compilation.fusion import QUANT_OPS from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.config import CompilationConfig, PassConfig, VllmConfig from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.quantization.utils.quant_utils import ( - GroupShape) + GroupShape, kFp8StaticTensorSym, kNvfp4Quant) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp) from vllm.platforms import current_platform from .backend import TestBackend +FP8_DTYPE = current_platform.fp8_dtype() +FP4_DTYPE = torch.uint8 -class TestModel(torch.nn.Module): - def __init__(self, hidden_size: int, force_fp8_e4m3fnuz: bool, *args, - **kwargs): - super().__init__(*args, **kwargs) +def is_nvfp4_supported(): + return 
current_platform.has_device_capability(100) + + +class TestSiluMulFp8QuantModel(torch.nn.Module): + + def __init__(self, hidden_size: int, force_fp8_e4m3fnuz: bool, **kwargs): + super().__init__() self.silu_and_mul = SiluAndMul() self.wscale = torch.rand(1, dtype=torch.float32) self.scale = torch.rand(1, dtype=torch.float32) - self.w = (torch.rand( - hidden_size, - hidden_size).to(dtype=current_platform.fp8_dtype()).t()) + self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() self.fp8_linear = Fp8LinearOp( force_fp8_e4m3fnuz=force_fp8_e4m3fnuz, @@ -45,14 +54,56 @@ class TestModel(torch.nn.Module): input_scale=self.wscale) return x2 + def ops_in_model_before(self): + return [SILU_MUL_OP, QUANT_OPS[kFp8StaticTensorSym]] -@pytest.mark.parametrize("num_tokens", [256]) -@pytest.mark.parametrize("hidden_size", [64]) + def ops_in_model_after(self): + return [FUSED_OPS[kFp8StaticTensorSym]] + + +class TestSiluMulNvfp4QuantModel(torch.nn.Module): + + def __init__(self, hidden_size: int, **kwargs): + super().__init__() + self.silu_and_mul = SiluAndMul() + self.w = torch.randint(256, (hidden_size, hidden_size // 2), + dtype=FP4_DTYPE) + self.wscale = torch.randn(hidden_size, + hidden_size // 16).to(dtype=FP8_DTYPE) + self.wscale2 = torch.rand(1, dtype=torch.float32) + self.scale = torch.rand(1, dtype=torch.float32) + + def forward(self, x): + y = self.silu_and_mul(x) + y_quant, y_block_scale = scaled_fp4_quant(y, 1 / self.scale) + out = cutlass_scaled_fp4_mm(a=y_quant, + b=self.w, + block_scale_a=y_block_scale, + block_scale_b=self.wscale, + alpha=self.scale * self.wscale2, + out_dtype=y.dtype) + return out + + def ops_in_model_before(self): + return [SILU_MUL_OP, QUANT_OPS[kNvfp4Quant]] + + def ops_in_model_after(self): + return [FUSED_OPS[kNvfp4Quant]] + + +@pytest.mark.parametrize("num_tokens", [64]) +@pytest.mark.parametrize("hidden_size", [128]) +@pytest.mark.parametrize( + "model_class", [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel] + if 
is_nvfp4_supported() else [TestSiluMulFp8QuantModel]) @pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm") -def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, +def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class, force_fp8_e4m3fnuz): + if model_class == TestSiluMulNvfp4QuantModel and force_fp8_e4m3fnuz: + pytest.skip("Duplicate tests for NVFP4") + torch.set_default_device("cuda") torch.set_default_dtype(torch.float16) @@ -63,7 +114,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, fusion_pass = ActivationQuantFusionPass(config) backend = TestBackend(NoOpEliminationPass(config), fusion_pass) - model = TestModel(hidden_size, force_fp8_e4m3fnuz) + model = model_class(hidden_size=hidden_size, + force_fp8_e4m3fnuz=force_fp8_e4m3fnuz) # First dimension dynamic x = torch.rand(num_tokens, hidden_size * 2) @@ -80,17 +132,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, atol=1e-3, rtol=1e-3) - # Check substitution worked - pre_nodes = backend.graph_pre_pass.nodes - post_nodes = backend.graph_post_pass.nodes + # In pre-nodes, quant op should be present and fused kernels should not + backend.check_before_ops(model.ops_in_model_before()) - silu_and_mul_quant = torch.ops._C.silu_and_mul_quant.default - fp8_quant = torch.ops._C.static_scaled_fp8_quant.default - - # In pre-nodes, fp8 quant should be present and fused kernels should not - assert find_auto_fn_maybe(pre_nodes, silu_and_mul_quant) is None - find_auto_fn(pre_nodes, fp8_quant) - - # In post-nodes, fused kernels should be present and fp8 quant should not - find_auto_fn(post_nodes, silu_and_mul_quant) - assert find_auto_fn_maybe(post_nodes, fp8_quant) is None + # In post-nodes, fused kernels should be present and quant op should not + backend.check_after_ops(model.ops_in_model_after()) diff --git 
a/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py b/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py new file mode 100644 index 0000000000000..969f14cc3fe62 --- /dev/null +++ b/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py @@ -0,0 +1,126 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + +from tests.kernels.utils import opcheck +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.platforms import current_platform +from vllm.scalar_type import scalar_types + +if not current_platform.has_device_capability(100): + pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.", + allow_module_level=True) + +DTYPES = [torch.float16, torch.bfloat16] +SHAPES = [(128, 64), (128, 128), (256, 64), (256, 128)] +SEEDS = [42] +CUDA_DEVICES = ['cuda:0'] + +FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() +FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max + +BLOCK_SIZE = 16 + + +def ref_impl(silu_and_mul: SiluAndMul, x: torch.Tensor, + global_scale: torch.Tensor, + ref_output_scale: torch.Tensor) -> torch.Tensor: + silu_and_mul_out = silu_and_mul.forward_native(x) + assert not current_platform.is_rocm() + assert silu_and_mul_out.ndim >= 1, ( + f'input.ndim needs to be >= 1, but got {silu_and_mul_out.ndim}.') + other_dims = 1 if silu_and_mul_out.ndim == 1 else -1 + silu_and_mul_out = silu_and_mul_out.reshape(other_dims, + silu_and_mul_out.shape[-1]) + m, n = silu_and_mul_out.shape + device = silu_and_mul_out.device + + # Two fp4 values will be packed into an uint8. 
+ out = torch.empty((m, n // 2), device=device, dtype=torch.uint8) + + output_scale = ref_output_scale + + torch.ops._C.scaled_fp4_quant(out, silu_and_mul_out, output_scale, + global_scale) + + return out, output_scale + + +def ops_impl(x: torch.Tensor, global_scale: torch.Tensor, + ref_output_scale: torch.Tensor) -> torch.Tensor: + out_shape = (x.shape[0], x.shape[1] // 4) + output_scale = ref_output_scale + out = torch.empty(out_shape, dtype=torch.uint8, device=x.device) + torch.ops._C.silu_and_mul_nvfp4_quant(out, output_scale, x, global_scale) + return out, output_scale + + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_quantize_to_fp4( + dtype: torch.dtype, + shape: tuple[int, int], + seed: int, + device: str, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + + m, n = shape + + x = torch.randn((m, n), dtype=dtype) + tensor_amax = torch.abs(x).max().to(torch.float32) + global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax + + block_size = 16 + + assert n % block_size == 0, ( + f'last dim has to be multiple of 16, but got {n}.') + assert x.dtype in (torch.float16, torch.bfloat16), ( + f'input.dtype needs to be fp16 or bf16 but got {x.dtype}.') + + round_up = lambda x, y: (x + y - 1) // y * y + rounded_m = round_up(x.shape[0], 128) + scale_n = x.shape[1] // (2 * block_size) + rounded_n = round_up(scale_n, 4) + output_scale = torch.empty((rounded_m, rounded_n // 4), + device=x.device, + dtype=torch.int32) + + layer = SiluAndMul() + + ref_out, ref_out_scale = ref_impl(layer, x, global_scale, output_scale) + + fusion_out, fusion_out_scale = ops_impl(x, global_scale, output_scale) + + assert ref_out.dtype == torch.uint8 + assert fusion_out.dtype == torch.uint8 + assert ref_out.shape == fusion_out.shape + + assert ref_out_scale.dtype == torch.int32 + assert 
fusion_out_scale.dtype == torch.int32 + assert ref_out_scale.shape == fusion_out_scale.shape + + # Allow up to 2% of mismatched values since BF16 has accuracy issues. + mis_threshold = 0.02 + atol = 0.4 + rtol = 0.4 + ref_logits = ref_out[-1] + fusion_logits = fusion_out[-1] + + mis_count = torch.sum( + torch.abs(fusion_logits - ref_logits) > (atol + + rtol * torch.abs(ref_logits))) + mis_ratio = mis_count / fusion_logits.numel() + + assert mis_ratio < mis_threshold, \ + f"Mismatch ratio {mis_ratio} exceeds threshold {mis_threshold}" + + torch.testing.assert_close(ref_out_scale, fusion_out_scale) + + opcheck(torch.ops._C.silu_and_mul_nvfp4_quant, + (fusion_out, fusion_out_scale, x, global_scale)) diff --git a/vllm/compilation/activation_quant_fusion.py b/vllm/compilation/activation_quant_fusion.py index 826014f770df3..40e124a03eb08 100644 --- a/vllm/compilation/activation_quant_fusion.py +++ b/vllm/compilation/activation_quant_fusion.py @@ -1,55 +1,154 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod + import torch from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._inductor.pattern_matcher import (PatternMatcherPass, fwd_only, register_replacement) +from torch._ops import OpOverload from vllm.config import VllmConfig from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + QuantKey, kFp8StaticTensorSym, kNvfp4Quant, kStaticTensorScale) from vllm.platforms import current_platform +from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32 from .inductor_pass import enable_fake_mode from .vllm_inductor_pass import VllmInductorPass logger = init_logger(__name__) +FP8_DTYPE = current_platform.fp8_dtype() +FP4_DTYPE = torch.uint8 -def silu_mul_pattern_static(result: torch.Tensor, - result_silu_mul: torch.Tensor, input: torch.Tensor, - scale: torch.Tensor): - at1 = 
auto_functionalized(torch.ops._C.silu_and_mul.default, - result=result_silu_mul, - input=input) - at2 = auto_functionalized(torch.ops._C.static_scaled_fp8_quant.default, - result=result, - input=at1[1], - scale=scale) - return at2[1] +SILU_MUL_OP = torch.ops._C.silu_and_mul.default + +FUSED_OPS: dict[QuantKey, OpOverload] = { + kFp8StaticTensorSym: torch.ops._C.silu_and_mul_quant.default, # noqa: E501 +} +if current_platform.is_cuda() and hasattr(torch.ops._C, + "silu_and_mul_nvfp4_quant"): + FUSED_OPS[ + kNvfp4Quant] = torch.ops._C.silu_and_mul_nvfp4_quant.default # noqa: E501 -def silu_mul_replacement_static(result: torch.Tensor, - result_silu_mul: torch.Tensor, - input: torch.Tensor, scale: torch.Tensor): - at = auto_functionalized(torch.ops._C.silu_and_mul_quant.default, - result=result, - input=input, - scale=scale) - return at[1] +class ActivationQuantPattern(ABC): + """ + The base class for Activation+Quant fusions. + Should not be used directly. + """ + + def __init__( + self, + quant_key: QuantKey, + ): + self.quant_key = quant_key + self.quant_dtype = quant_key.dtype + + assert self.quant_key in QUANT_OPS, \ + f"unsupported quantization scheme {self.quant_key}" + self.QUANT_OP = QUANT_OPS[self.quant_key] + + assert self.quant_key in FUSED_OPS, \ + f"unsupported fusion scheme {self.quant_key}" + self.FUSED_OP = FUSED_OPS[self.quant_key] + + def empty_quant(self, *args, **kwargs): + kwargs = {'dtype': self.quant_dtype, 'device': "cuda", **kwargs} + return torch.empty(*args, **kwargs) + + @abstractmethod + def register(self, pm_pass: PatternMatcherPass): + raise NotImplementedError -def empty_bf16(*args, **kwargs): - return torch.empty(*args, **kwargs, dtype=torch.bfloat16, device="cuda") +class SiluMulFp8StaticQuantPattern(ActivationQuantPattern): + """ + Fusion for SiluMul+Fp8StaticQuant Pattern + """ + + def __init__(self, symmetric: bool = True): + quant_key = QuantKey(dtype=FP8_DTYPE, + scale=kStaticTensorScale, + symmetric=symmetric) + 
super().__init__(quant_key) + + def register(self, pm_pass: PatternMatcherPass): + + def pattern(result: torch.Tensor, result_silu_mul: torch.Tensor, + input: torch.Tensor, scale: torch.Tensor): + at1 = auto_functionalized(SILU_MUL_OP, + result=result_silu_mul, + input=input) + at2 = auto_functionalized(self.QUANT_OP, + result=result, + input=at1[1], + scale=scale) + return at2[1] + + def replacement(result: torch.Tensor, result_silu_mul: torch.Tensor, + input: torch.Tensor, scale: torch.Tensor): + at = auto_functionalized(self.FUSED_OP, + result=result, + input=input, + scale=scale) + return at[1] + + inputs = [ + self.empty_quant(5, 4), # result + empty_bf16(5, 4), # result_silu_mul + empty_bf16(5, 4), # input + empty_fp32(1, 1) # scale + ] + + register_replacement(pattern, replacement, inputs, fwd_only, pm_pass) -def empty_fp8(*args, **kwargs): - fp8 = current_platform.fp8_dtype() - return torch.empty(*args, **kwargs, dtype=fp8, device="cuda") +class SiluMulNvfp4QuantPattern(ActivationQuantPattern): + """ + Fusion for SiluMul+Nvfp4Quant Pattern + """ + def __init__(self): + super().__init__(kNvfp4Quant) -def empty_fp32(*args, **kwargs): - return torch.empty(*args, **kwargs, dtype=torch.float32, device="cuda") + def register(self, pm_pass: PatternMatcherPass): + + def pattern(result: torch.Tensor, output_scale: torch.Tensor, + result_silu_mul: torch.Tensor, input: torch.Tensor, + scale: torch.Tensor): + at1 = auto_functionalized(SILU_MUL_OP, + result=result_silu_mul, + input=input) + at2 = auto_functionalized(self.QUANT_OP, + output=result, + input=at1[1], + output_scale=output_scale, + input_scale=scale) + return at2[1], at2[2] + + def replacement(result: torch.Tensor, output_scale: torch.Tensor, + result_silu_mul: torch.Tensor, input: torch.Tensor, + scale: torch.Tensor): + at = auto_functionalized(self.FUSED_OP, + result=result, + result_block_scale=output_scale, + input=input, + input_global_scale=scale) + return at[1], at[2] + + inputs = [ + 
self.empty_quant(5, 32), # result + empty_i32(128, 4), # output_scale + empty_bf16(5, 64), # result_silu_mul + empty_bf16(5, 64), # input + empty_fp32(1, 1) # scale + ] + + register_replacement(pattern, replacement, inputs, fwd_only, pm_pass) class ActivationQuantFusionPass(VllmInductorPass): @@ -69,15 +168,11 @@ class ActivationQuantFusionPass(VllmInductorPass): self.patterns: PatternMatcherPass = PatternMatcherPass( pass_name="activation_quant_fusion_pass") - inputs = [ - empty_fp8(5, 4), # Quant output - empty_bf16(5, 4), # Silu_and_mul output - empty_bf16(5, 4), # Input - empty_fp32(1, 1) # Scale - ] - register_replacement(silu_mul_pattern_static, - silu_mul_replacement_static, inputs, fwd_only, - self.patterns) + pattern_silu_mul_fp8 = SiluMulFp8StaticQuantPattern() + pattern_silu_mul_fp8.register(self.patterns) + + pattern_silu_mul_nvfp4 = SiluMulNvfp4QuantPattern() + pattern_silu_mul_nvfp4.register(self.patterns) def __call__(self, graph: torch.fx.Graph): self.begin() @@ -89,3 +184,8 @@ class ActivationQuantFusionPass(VllmInductorPass): self.dump_graph(graph, "after_act_quant_fusion") self.end_and_log() + + def uuid(self): + return VllmInductorPass.hash_source(self, ActivationQuantPattern, + SiluMulFp8StaticQuantPattern, + SiluMulNvfp4QuantPattern) diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index 60ae143318790..a36dd8b845f1a 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -97,6 +97,13 @@ class FixFunctionalizationPass(VllmInductorPass): node, mutated_args, args=('result', 'input', 'scale')) + elif at_target == torch.ops._C.silu_and_mul_nvfp4_quant.default: + mutated_args = {1: 'result', 2: 'result_block_scale'} + self.defunctionalize(graph, + node, + mutated_args, + args=('result', 'result_block_scale', + 'input', 'input_global_scale')) else: continue # skip the count diff --git a/vllm/model_executor/layers/quantization/modelopt.py 
b/vllm/model_executor/layers/quantization/modelopt.py index 9d4e453ffc545..1fbb2e3bb6f28 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -885,6 +885,10 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): layer.alpha = Parameter(layer.input_scale * layer.weight_scale_2, requires_grad=False) + # Calculate `1 / input_scale` so that we don't need to do so at runtime + layer.input_scale_inv = Parameter( + (1 / layer.input_scale).to(torch.float32), requires_grad=False) + # Swizzle the weight blockscale. # contracting dimension is input dimension # block_size = 16; @@ -941,8 +945,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): output_shape = [x.shape[0], layer.weight.shape[0]] # quantize BF16 or FP16 to (FP4 and interleaved block scale) - s_quant = 1 / layer.input_scale - x_fp4, x_blockscale = scaled_fp4_quant(x, s_quant) + x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_scale_inv) # validate dtypes of quantized input, input block scale, # weight and weight_blockscale From 27e88cee748d41e07268ca140d15252c6b38acf1 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Thu, 28 Aug 2025 13:17:15 -0700 Subject: [PATCH 05/56] chore: build release image by default (#23852) Signed-off-by: Codex --- .buildkite/release-pipeline.yaml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 86aae426c258c..92a1bcada3879 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -62,12 +62,8 @@ steps: env: DOCKER_BUILDKIT: "1" - - block: "Build release image (x86)" - depends_on: ~ - key: block-release-image-build - - label: "Build release image (x86)" - depends_on: block-release-image-build + depends_on: ~ id: build-release-image-x86 agents: queue: cpu_queue_postmerge @@ -80,7 +76,7 @@ steps: - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - label: "Build 
release image (arm64)" - depends_on: block-release-image-build + depends_on: ~ id: build-release-image-arm64 agents: queue: arm64_cpu_queue_postmerge From 7ffbf27239c3ff68d773e7d2e2cd284f1375349f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 28 Aug 2025 14:22:46 -0700 Subject: [PATCH 06/56] [BugFix][FlashInfer] Fix potential race condition for paged_kv_indptr_cpu (#23737) Signed-off-by: Woosuk Kwon --- vllm/v1/attention/backends/flashinfer.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 70d3471a47259..5fc3a1517b690 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -237,6 +237,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): device="cpu", pin_memory=pin_memory) self.paged_kv_indptr_np = self.paged_kv_indptr_cpu.numpy() + self.paged_kv_indptr_buffer = torch.zeros_like( + self.paged_kv_indptr_cpu, pin_memory=pin_memory) self.paged_kv_indices_cpu = torch.zeros(max_num_pages, dtype=torch.int32, device="cpu", @@ -361,12 +363,18 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): dtype=np.int32, out=self.paged_kv_indptr_np[1:num_reqs + 1], ) + # NOTE(woosuk): Because self.paged_kv_indptr_cpu can be modified + # after this line (e.g., for cuda graphs), we need to copy the data to + # self.paged_kv_indptr_buffer to avoid race condition. 
+ self.paged_kv_indptr_buffer[:num_reqs + + 1] = (self.paged_kv_indptr_cpu[:num_reqs + + 1]) paged_kv_indptr = self.paged_kv_indptr[:num_reqs + 1] - paged_kv_indptr.copy_(self.paged_kv_indptr_cpu[:num_reqs + 1], + paged_kv_indptr.copy_(self.paged_kv_indptr_buffer[:num_reqs + 1], non_blocking=True) # write self.paged_kv_indices inplace - num_actual_pages = num_blocks_np.sum().item() + num_actual_pages = self.paged_kv_indptr_np[num_reqs] paged_kv_indices = self.paged_kv_indices[:num_actual_pages] _copy_page_indices_kernel[(num_reqs, )]( paged_kv_indices, From cb293f6a790d555d6d7ced872118ff029bd828e8 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Thu, 28 Aug 2025 14:54:30 -0700 Subject: [PATCH 07/56] [V1] Enable prefill optimization for Gemma3n (#22628) Signed-off-by: Yong Hoon Shin --- tests/v1/e2e/test_kv_sharing_fast_prefill.py | 57 --- vllm/config/cache.py | 12 +- vllm/model_executor/models/gemma3n.py | 435 +++++++++++++++---- vllm/model_executor/models/gemma3n_mm.py | 2 +- vllm/v1/attention/backends/utils.py | 139 +++++- vllm/v1/engine/async_llm.py | 7 + vllm/v1/worker/gpu_model_runner.py | 96 ++-- vllm/v1/worker/tpu_model_runner.py | 39 +- vllm/v1/worker/utils.py | 40 +- 9 files changed, 591 insertions(+), 236 deletions(-) diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index d72e50e5196b8..7bc7f44dd7ab1 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random -from typing import Optional, Union import pytest import torch @@ -10,12 +9,6 @@ import torch from vllm import LLM, SamplingParams from vllm.config import CompilationConfig, CompilationLevel from vllm.distributed import cleanup_dist_env_and_memory -from vllm.forward_context import get_forward_context -from vllm.model_executor.models.gemma3n_mm import ( - 
Gemma3nForConditionalGeneration) -from vllm.model_executor.models.registry import ModelRegistry -from vllm.model_executor.models.utils import extract_layer_index -from vllm.sequence import IntermediateTensors from ...utils import fork_new_process_for_each_test @@ -23,54 +16,6 @@ from ...utils import fork_new_process_for_each_test SEED = 42 -class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration): - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - **kwargs, - ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = super().forward(input_ids, positions, - intermediate_tensors, inputs_embeds, - **kwargs) - attn_metadata = get_forward_context().attn_metadata - # attn_metadata is None during dummy runs - if (attn_metadata is not None - and self.language_model.cache_config.kv_sharing_fast_prefill): - assert isinstance(attn_metadata, dict) # true in V1 - # Gemma3n-E2B has 30 layers, with last 20 layers being - # cross-decoder layers. 
Check attention metadata is correct - for layer_name, metadata in attn_metadata.items(): - layer_idx = extract_layer_index(layer_name) - if layer_idx >= 20: - assert hasattr(metadata, 'logits_indices_padded') - assert hasattr(metadata, 'num_logits_indices') - else: - assert not hasattr(metadata, 'logits_indices_padded') - assert not hasattr(metadata, 'num_logits_indices') - - # Last layer will be a KV sharing layer - layer_attn_metadata = attn_metadata[ - self.language_model.model.layers[-1].self_attn.attn.layer_name] - logits_indices_padded = (layer_attn_metadata.logits_indices_padded) - assert logits_indices_padded is not None - num_logits_indices = layer_attn_metadata.num_logits_indices - assert num_logits_indices > 0 - # Reset hidden states to random values and - # only set logits at logits_indices to valid values - # Because logits_indices are the only positions that are used - # for output token sampling, this still produces same outputs - logits_hs = hidden_states[logits_indices_padded] - hidden_states = torch.randn_like(hidden_states) - gen_indices = logits_indices_padded[:num_logits_indices] - hidden_states[gen_indices] = logits_hs[:num_logits_indices] - - return hidden_states - - @pytest.fixture def test_prompts(): """ @@ -124,8 +69,6 @@ def test_kv_sharing_fast_prefill( enforce_eager: bool, test_prompts: list[str], ): - ModelRegistry.register_model("Gemma3nForConditionalGeneration", - TestGemma3nForConditionalGeneration) sampling_params = SamplingParams(temperature=0.0, max_tokens=100) compilation_config = CompilationConfig( # This allows vLLM compilation backend to handle allocating and diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 3d2aa6b17be79..79761e7844859 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -145,12 +145,19 @@ class CacheConfig: self._verify_cache_dtype() self._verify_prefix_caching() + self._verify_kv_sharing_fast_prefill() def metrics_info(self): # convert cache_config to dict(key: str, value: str) for 
prometheus # metrics info return {key: str(value) for key, value in self.__dict__.items()} + def _verify_kv_sharing_fast_prefill(self) -> None: + if self.kv_sharing_fast_prefill and not envs.VLLM_USE_V1: + raise NotImplementedError( + "Fast prefill optimization for KV sharing is not supported " + "in V0 currently.") + @model_validator(mode='after') def _verify_args(self) -> Self: if self.cpu_offload_gb < 0: @@ -162,11 +169,6 @@ class CacheConfig: "GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.") - if self.kv_sharing_fast_prefill: - logger.warning_once( - "--kv-sharing-fast-prefill is currently work in progress " - "and not functional yet (i.e. no prefill savings)") - return self def _verify_cache_dtype(self) -> None: diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index ffec3408702c9..0e0e191e75fcf 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -23,9 +23,11 @@ from torch import nn from transformers.models.gemma3n.configuration_gemma3n import Gemma3nTextConfig from vllm.attention import Attention +from vllm.compilation.backends import set_model_tag from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.forward_context import get_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.activation import (_ACTIVATION_REGISTRY, GeluAndMul, @@ -45,6 +47,7 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.v1.attention.backends.utils import KVSharingFastPrefillMetadata from .interfaces import SupportsQuant from .utils import (AutoWeightsLoader, extract_layer_index, @@ -533,7 +536,178 @@ class 
Gemma3nDecoderLayer(nn.Module): return corrected_predictions -@support_torch_compile +# This enables torch.compile if --kv-sharing-fast-prefill passed +@support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config. + kv_sharing_fast_prefill) +class Gemma3nSelfDecoder(nn.Module): + """ + Includes altup embedding and self decoder layers + """ + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + decoder_layers: list[Gemma3nDecoderLayer], + layer_idx_start: int, + per_layer_model_projection: ColumnParallelLinear, + embed_scale_per_layer: torch.Tensor, + embed_tokens_per_layer: VocabParallelEmbedding, + per_layer_projection_norm: RMSNorm, + per_layer_input_scale: torch.Tensor, + altup_projections: nn.ModuleList, + eps: torch.Tensor, + embed_tokens: VocabParallelEmbedding, + embed_scale: torch.Tensor, + ): + super().__init__() + self.decoder_layers = decoder_layers + self.layer_idx_start = layer_idx_start + self.per_layer_model_projection = per_layer_model_projection + self.config = vllm_config.model_config.hf_config + self.embed_scale_per_layer = embed_scale_per_layer + self.embed_tokens_per_layer = embed_tokens_per_layer + self.per_layer_projection_norm = per_layer_projection_norm + self.per_layer_input_scale = per_layer_input_scale + self.altup_projections = altup_projections + self.eps = eps + self.embed_tokens = embed_tokens + self.embed_scale = embed_scale + + def get_per_layer_input_embeddings( + self, input_ids: torch.Tensor) -> torch.Tensor: + # Deal with the fact that vocab_size_per_layer_input < vocab_size + # which causes us to have some out of vocab tokens by setting + # those token ids to 0. This matches the HF implementation. 
+ per_layer_inputs_mask = torch.logical_and( + input_ids >= 0, input_ids < self.config.vocab_size_per_layer_input) + per_layer_inputs_tokens = torch.where(per_layer_inputs_mask, input_ids, + torch.zeros_like(input_ids)) + return self.embed_tokens_per_layer( + per_layer_inputs_tokens) * self.embed_scale_per_layer + + def get_per_layer_inputs( + self, + hidden_states_0: torch.Tensor, + per_layer_inputs: Optional[torch.Tensor], + ) -> torch.Tensor: + per_layer_projection = self.per_layer_model_projection(hidden_states_0) + per_layer_projection = per_layer_projection.reshape( + *hidden_states_0.shape[:-1], + self.config.num_hidden_layers, + self.config.hidden_size_per_layer_input, + ) + per_layer_projection = self.per_layer_projection_norm( + per_layer_projection) + if per_layer_inputs is not None: + # Profiling run does not compute per_layer_inputs + per_layer_inputs = per_layer_projection + per_layer_inputs + per_layer_inputs *= self.per_layer_input_scale + else: + per_layer_inputs = per_layer_projection + return per_layer_inputs + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) * self.embed_scale + + def altup_embed(self, hidden_states_0: torch.Tensor) -> torch.Tensor: + # Altup embed. 
+ hidden_states = [hidden_states_0] * self.config.altup_num_inputs + target_magnitude = torch.mean(hidden_states_0**2, dim=-1, + keepdim=True)**0.5 + for i in range(1, self.config.altup_num_inputs): + hidden_states[i] = self.altup_projections[i - 1](hidden_states[i]) + new_magnitude = torch.mean(hidden_states[i]**2, + dim=-1, + keepdim=True)**0.5 + hidden_states[i] *= target_magnitude / torch.maximum( + new_magnitude, self.eps) + hidden_states = torch.stack(hidden_states, dim=-1) + return hidden_states + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + per_layer_inputs: Optional[torch.Tensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, torch.Tensor]: + if inputs_embeds is not None: + hidden_states_0 = inputs_embeds + else: + hidden_states_0 = self.get_input_embeddings(input_ids) + + adjusted_per_layer_inputs = self.get_per_layer_inputs( + hidden_states_0, per_layer_inputs) + hidden_states = self.altup_embed(hidden_states_0) + + # [altnum_inputs, num_tokens, hidden_size] + hidden_states = hidden_states.permute(2, 0, 1) + + for idx, layer in enumerate(self.decoder_layers): + layer_idx = idx + self.layer_idx_start + # [altup_num_inputs, num_tokens, hidden_size] + hidden_states = layer( + positions=positions, + hidden_states=hidden_states, + per_layer_input=adjusted_per_layer_inputs[:, layer_idx, :], + **kwargs, + ) + + # [num_tokens, hidden_size, altnum_inputs] + hidden_states = hidden_states.permute(1, 2, 0) + + return hidden_states, adjusted_per_layer_inputs + + +# This enables torch.compile if --kv-sharing-fast-prefill passed +@support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config. 
+ kv_sharing_fast_prefill) +class Gemma3nCrossDecoder(nn.Module): + """ + Cross-decoder layers + """ + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + decoder_layers: list[Gemma3nDecoderLayer], + layer_idx_start: int, + ): + super().__init__() + self.decoder_layers = decoder_layers + self.layer_idx_start = layer_idx_start + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + per_layer_inputs: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + # [altnum_inputs, num_tokens, hidden_size] + hidden_states = hidden_states.permute(2, 0, 1) + for idx, layer in enumerate(self.decoder_layers): + layer_idx = idx + self.layer_idx_start + # [altup_num_inputs, num_tokens, hidden_size] + hidden_states = layer( + positions=positions, + hidden_states=hidden_states, + per_layer_input=per_layer_inputs[:, layer_idx, :], + **kwargs, + ) + # [num_tokens, hidden_size, altnum_inputs] + hidden_states = hidden_states.permute(1, 2, 0) + return hidden_states + + +# This disables torch.compile if --kv-sharing-fast-prefill passed +@support_torch_compile(enable_if=lambda vllm_config: not vllm_config. 
+ cache_config.kv_sharing_fast_prefill) class Gemma3nTextModel(nn.Module, SupportsQuant): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -543,7 +717,6 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, @@ -613,95 +786,211 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): lambda prefix: Gemma3nDecoderLayer( config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") + + self.eps = torch.tensor(torch.finfo().min) + + first_kv_shared_layer_idx = (config.num_hidden_layers - + config.num_kv_shared_layers) + # Layer idx 0-19 are self-decoder layers in You Only Cache Once (YOCO) + with set_model_tag("self_decoder"): + self.self_decoder = Gemma3nSelfDecoder( + vllm_config=vllm_config, + prefix=f"{prefix}.self_decoder", + decoder_layers=self.layers[:first_kv_shared_layer_idx], + layer_idx_start=0, + per_layer_model_projection=self.per_layer_model_projection, + embed_scale_per_layer=self.embed_scale_per_layer, + embed_tokens_per_layer=self.embed_tokens_per_layer, + per_layer_projection_norm=self.per_layer_projection_norm, + per_layer_input_scale=self.per_layer_input_scale, + altup_projections=self.altup_projections, + eps=self.eps, + embed_tokens=self.embed_tokens, + embed_scale=self.embed_scale, + ) + # Layer idx 20-30 are cross-decoder layers in YOCO + with set_model_tag("cross_decoder"): + self.cross_decoder = Gemma3nCrossDecoder( + vllm_config=vllm_config, + prefix=f"{prefix}.cross_decoder", + decoder_layers=self.layers[first_kv_shared_layer_idx:], + layer_idx_start=first_kv_shared_layer_idx, + ) + self.norm = RMSNorm( config.hidden_size, eps=config.rms_norm_eps, ) - self.eps = torch.tensor(torch.finfo().min) + + self.fast_prefill_enabled = cache_config.kv_sharing_fast_prefill + + if self.fast_prefill_enabled: + # Allocate static buffers for 
CUDAGraph + # TODO(sarckk): Extract this functionality to interface + max_num_tokens = vllm_config.scheduler_config.max_num_batched_tokens + device = next(self.parameters()).device + self.positions = torch.zeros(max_num_tokens, + dtype=torch.int64, + device=device) + self.hidden_states = torch.zeros( + (max_num_tokens, config.hidden_size, + self.config.altup_num_inputs), + dtype=self.embed_tokens.weight.dtype, + device=device, + ) + self.per_layer_inputs = torch.zeros( + (max_num_tokens, self.config.num_hidden_layers, + self.config.hidden_size_per_layer_input), + dtype=self.embed_tokens.weight.dtype, + device=device, + ) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) * self.embed_scale + return self.self_decoder.get_input_embeddings(input_ids) - def get_per_layer_input_embeddings( - self, input_ids: torch.Tensor) -> torch.Tensor: - # Deal with the fact that vocab_size_per_layer_input < vocab_size - # which causes us to have some out of vocab tokens by setting - # those token ids to 0. This matches the HF implementation. 
- per_layer_inputs_mask = torch.logical_and( - input_ids >= 0, input_ids < self.config.vocab_size_per_layer_input) - per_layer_inputs_tokens = torch.where(per_layer_inputs_mask, input_ids, - torch.zeros_like(input_ids)) - return self.embed_tokens_per_layer( - per_layer_inputs_tokens) * self.embed_scale_per_layer + def fast_prefill_forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + per_layer_inputs: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.Tensor: + logits_indices_padded, num_logits_indices = None, None + attn_metadata = get_forward_context().attn_metadata + + # attn_metadata is None during dummy runs + if (self.fast_prefill_enabled and attn_metadata is not None): + assert isinstance(attn_metadata, dict) + # Last layer is a KV sharing layer + layer_attn_metadata = attn_metadata[ + self.layers[-1].self_attn.attn.layer_name] + if (isinstance(layer_attn_metadata, KVSharingFastPrefillMetadata)): + logits_indices_padded = ( + layer_attn_metadata.logits_indices_padded) + num_logits_indices = layer_attn_metadata.num_logits_indices + + # Copy inputs for cudagraph + batch_size = positions.size(0) + self.positions[:batch_size].copy_(positions) + self_decoder_hidden_states, per_layer_inputs_adjusted = \ + self.self_decoder( + input_ids=input_ids, + positions=self.positions[:batch_size], + inputs_embeds=inputs_embeds, + per_layer_inputs=per_layer_inputs, + **kwargs, + ) + + if logits_indices_padded is None: + logits_indices_padded = torch.arange( + positions.size(0), + dtype=positions.dtype, + device=positions.device, + ) + + # NOTE(sarckk): There is currently a bug caused by + # vLLM converting output of last piecewise CUDA graph + # to weakref, causing memory to be prematurely freed + # when there are multiple compilation units + # Keep .clone() until fix in + # https://github.com/vllm-project/vllm/pull/22282 + hidden_states = self_decoder_hidden_states.clone() + + # Copy inputs for cudagraph 
+ num_padded_logits_indices = logits_indices_padded.size(0) + self.positions[:num_padded_logits_indices].copy_( + positions[logits_indices_padded]) + self.hidden_states[:num_padded_logits_indices].copy_( + self_decoder_hidden_states[logits_indices_padded]) + self.per_layer_inputs[:num_padded_logits_indices].copy_( + per_layer_inputs_adjusted[logits_indices_padded]) + cross_decoder_hidden_states = self.cross_decoder( + positions=self.positions[:num_padded_logits_indices], + hidden_states=self.hidden_states[:num_padded_logits_indices], + per_layer_inputs=self.per_layer_inputs[:num_padded_logits_indices], + **kwargs, + ) + + if num_logits_indices is not None: + assert num_logits_indices > 0 + # Merge cross-decoder and self-decoder hidden states + hidden_states[logits_indices_padded[:num_logits_indices]] = ( + cross_decoder_hidden_states[:num_logits_indices]) + else: + hidden_states = cross_decoder_hidden_states + + return hidden_states + + def normal_forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + per_layer_inputs: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.Tensor: + hidden_states, per_layer_inputs = self.self_decoder( + input_ids=input_ids, + positions=positions, + inputs_embeds=inputs_embeds, + per_layer_inputs=per_layer_inputs, + **kwargs, + ) + hidden_states = self.cross_decoder( + positions=positions, + hidden_states=hidden_states, + per_layer_inputs=per_layer_inputs, + **kwargs, + ) + return hidden_states + + def altup_unembed( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + # Altup unembed. 
+ target_magnitude = torch.mean(hidden_states[..., 0]**2, + dim=-1, + keepdim=True)**0.5 + for i in range(1, self.config.altup_num_inputs): + hidden_states[..., i] = self.altup_unembed_projections[i - 1]( + hidden_states[..., i]) + new_magnitude = torch.mean(hidden_states[..., i]**2, + dim=-1, + keepdim=True)**0.5 + hidden_states[..., i] *= target_magnitude / torch.maximum( + new_magnitude, self.eps) + # [num_tokens,hidden_size, altup_num_inputs] -> [num_tokens,hidden_size] + hidden_states = torch.mean(hidden_states, dim=-1) + return hidden_states def forward( self, input_ids: Optional[torch.Tensor], positions: torch.Tensor, - per_layer_inputs: torch.Tensor, + per_layer_inputs: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs, ) -> Union[torch.Tensor, IntermediateTensors]: - if inputs_embeds is not None: - hidden_states_0 = inputs_embeds - else: - hidden_states_0 = self.get_input_embeddings(input_ids) - - per_layer_projection = self.per_layer_model_projection(hidden_states_0) - per_layer_projection = per_layer_projection.reshape( - *hidden_states_0.shape[:-1], - self.config.num_hidden_layers, - self.config.hidden_size_per_layer_input, - ) - per_layer_projection = self.per_layer_projection_norm( - per_layer_projection) - - if per_layer_inputs is not None: - # Profiling run does not compute per_layer_inputs - per_layer_inputs = per_layer_projection + per_layer_inputs - per_layer_inputs *= self.per_layer_input_scale - else: - per_layer_inputs = per_layer_projection - - # Altup embed. 
- hidden_states = [hidden_states_0] * self.config.altup_num_inputs - target_magnitude = torch.mean(hidden_states_0**2, dim=-1, - keepdim=True)**0.5 - for i in range(1, self.config.altup_num_inputs): - hidden_states[i] = self.altup_projections[i - 1](hidden_states[i]) - new_magnitude = torch.mean(hidden_states[i]**2, - dim=-1, - keepdim=True)**0.5 - hidden_states[i] *= target_magnitude / torch.maximum( - new_magnitude, self.eps) - hidden_states = torch.stack(hidden_states, dim=0) - - # Transformer blocks. - for layer_idx, layer in enumerate(self.layers): - # [altup_num_inputs, num_tokens, hidden_size] - hidden_states = layer( - positions=positions, - hidden_states=hidden_states, - per_layer_input=per_layer_inputs[:, layer_idx, :], + if self.fast_prefill_enabled: + hidden_states = self.fast_prefill_forward( + input_ids, + positions, + inputs_embeds, + per_layer_inputs, **kwargs, ) - - # Altup unembed. - target_magnitude = torch.mean(hidden_states[0]**2, - dim=-1, - keepdim=True)**0.5 - for i in range(1, self.config.altup_num_inputs): - hidden_states[i] = self.altup_unembed_projections[i - 1]( - hidden_states[i]) - new_magnitude = torch.mean(hidden_states[i]**2, - dim=-1, - keepdim=True)**0.5 - hidden_states[i] *= target_magnitude / torch.maximum( - new_magnitude, self.eps) - # [altup_num_inputs,num_tokens,hidden_size] -> [num_tokens,hidden_size] - hidden_states = torch.mean(hidden_states, dim=0) - + else: + hidden_states = self.normal_forward( + input_ids, + positions, + inputs_embeds, + per_layer_inputs, + **kwargs, + ) + hidden_states = self.altup_unembed(hidden_states) return self.norm(hidden_states) def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index d59dde1560aea..aba4f98ea5f33 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -620,7 +620,7 @@ class Gemma3nForConditionalGeneration(nn.Module, 
SupportsMultiModal): # NOTE (NickLucche) Each pass needs tokens to compute PLE so we cache # them here, as the model forward has only access to the input_embeds. if input_ids is not None: - per_layer_inputs = self.language_model.model.get_per_layer_input_embeddings( + per_layer_inputs = self.language_model.model.self_decoder.get_per_layer_input_embeddings( input_ids) per_layer_inputs = per_layer_inputs.reshape( -1, self.config.text_config.num_hidden_layers, diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 39bdbe125635b..ad53b2e80bc73 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -4,11 +4,13 @@ import abc import enum import functools from abc import abstractmethod -from dataclasses import dataclass, make_dataclass -from typing import TYPE_CHECKING, Any, ClassVar, Generic, Optional, TypeVar +from dataclasses import dataclass, fields, make_dataclass +from typing import (TYPE_CHECKING, Any, ClassVar, Generic, Optional, Protocol, + TypeVar) import numpy as np import torch +from typing_extensions import runtime_checkable from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.utils import cdiv @@ -19,7 +21,8 @@ if TYPE_CHECKING: from vllm.v1.worker.gpu_input_batch import InputBatch import vllm.envs as envs -from vllm.attention.backends.abstract import AttentionBackend +from vllm.attention.backends.abstract import (AttentionBackend, + AttentionMetadata) from vllm.attention.layer import Attention from vllm.distributed.kv_transfer.kv_connector.utils import ( get_kv_connector_cache_layout) @@ -65,6 +68,10 @@ class CommonAttentionMetadata: causal: bool = True + # Needed by FastPrefillAttentionBuilder + logits_indices_padded: Optional[torch.Tensor] = None + num_logits_indices: Optional[int] = None + @dataclass class UbatchSlice: @@ -542,6 +549,69 @@ def make_local_attention_virtual_batches( ) +def make_kv_sharing_fast_prefill_common_attn_metadata( + 
common_attn_metadata: CommonAttentionMetadata, +) -> CommonAttentionMetadata: + if common_attn_metadata.max_query_len == 1: + # All requests are decode (assume 1 token for now) + # Skip computing fast prefill path + return common_attn_metadata + + assert common_attn_metadata.logits_indices_padded is not None + assert common_attn_metadata.num_logits_indices is not None + + logits_indices_padded = common_attn_metadata.logits_indices_padded + num_logits_indices = common_attn_metadata.num_logits_indices + # Get rid of CUDAGraph padding, if any + logits_indices = logits_indices_padded[:num_logits_indices] + num_reqs = common_attn_metadata.num_reqs + query_start_loc = common_attn_metadata.query_start_loc + seq_lens = common_attn_metadata.seq_lens + # Example inputs + # num_reqs: 3 + # generation_indices: [14, 18, 19, 27] + # query_start_loc: [0, 15, 20, 28] + # seq_lens: [41, 31, 40] + + # Find how many decode indices belong to each request + # request_ids: [0, 1, 1, 2] + request_ids = torch.bucketize(logits_indices, + query_start_loc[1:], + right=True) + + # Figure out how many tokens are in each request + # num_decode_tokens: [1, 2, 1] + num_decode_tokens = torch.bincount(request_ids, minlength=num_reqs) + + # Calculate new query_start_loc with tokens in generation_indices + # decode_query_start_loc: [0, 1, 3, 4] + decode_query_start_loc = torch.empty(num_reqs + 1, + device=query_start_loc.device, + dtype=query_start_loc.dtype) + + decode_query_start_loc[0] = 0 + decode_query_start_loc[1:] = torch.cumsum(num_decode_tokens, dim=0) + decode_max_query_len = int(num_decode_tokens.max().item()) + total_num_decode_tokens = int(num_decode_tokens.sum().item()) + + common_attn_metadata = CommonAttentionMetadata( + query_start_loc=decode_query_start_loc, + query_start_loc_cpu=decode_query_start_loc.to("cpu", + non_blocking=True), + seq_lens=seq_lens, + seq_lens_cpu=seq_lens.to("cpu", non_blocking=True), + num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu, + 
num_reqs=num_reqs, + num_actual_tokens=total_num_decode_tokens, + max_query_len=decode_max_query_len, + max_seq_len=common_attn_metadata.max_seq_len, + block_table_tensor=common_attn_metadata.block_table_tensor, + slot_mapping=common_attn_metadata.slot_mapping, + causal=True, + ) + return common_attn_metadata + + def subclass_attention_backend( name_prefix: str, attention_backend_cls: type[AttentionBackend], builder_cls: type[AttentionMetadataBuilder[M]] @@ -679,13 +749,56 @@ def subclass_attention_metadata( return Wrapped -def make_kv_sharing_fast_prefill_attention_metadata( - metadata_cls: Any, ) -> Any: - """ - Return a new subclass of `metadata_cls` for fast prefill - """ - return subclass_attention_metadata( - name_prefix="KVSharingFastPrefill", - metadata_cls=metadata_cls, - fields=KV_SHARING_FAST_PREFILL_METADATA_FIELDS, - ) +@runtime_checkable +class KVSharingFastPrefillMetadata(Protocol): + logits_indices_padded: torch.Tensor + num_logits_indices: int + + +def create_fast_prefill_custom_backend( + prefix: str, + underlying_attn_backend: AttentionBackend, +) -> type[AttentionBackend]: + + underlying_builder = underlying_attn_backend.get_builder_cls() + + class FastPrefillAttentionBuilder(underlying_builder): # type: ignore + + def build(self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False) -> AttentionMetadata: + new_common_attn_metadata =\ + make_kv_sharing_fast_prefill_common_attn_metadata(common_attn_metadata) + metadata = super().build(common_prefix_len, + new_common_attn_metadata, fast_build) + + class KVSharingFastPrefillAttentionMetadata( + metadata.__class__, # type: ignore + KVSharingFastPrefillMetadata): + + def __init__(self, metadata, common_attn_metadata): + # Shallow copy all fields in metadata cls + for field in fields(metadata.__class__): + setattr(self, field.name, + getattr(metadata, field.name)) + + # Set additional fields that will be used in model code + assert 
(common_attn_metadata.logits_indices_padded + is not None + and common_attn_metadata.num_logits_indices + is not None) + self.logits_indices_padded = \ + common_attn_metadata.logits_indices_padded + self.num_logits_indices = \ + common_attn_metadata.num_logits_indices + + return KVSharingFastPrefillAttentionMetadata( + metadata, common_attn_metadata) + + attn_backend = subclass_attention_backend( + name_prefix=prefix, + attention_backend_cls=underlying_attn_backend, + builder_cls=FastPrefillAttentionBuilder) + + return attn_backend diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index dbea0b610b31a..7440fe1f07e91 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -335,6 +335,13 @@ class AsyncLLM(EngineClient): returning the RequestOutput back to the caller. """ + if (self.vllm_config.cache_config.kv_sharing_fast_prefill + and sampling_params.prompt_logprobs): + raise ValueError( + "--kv-sharing-fast-prefill produces incorrect logprobs for " + "prompt tokens, please disable it when the requests need " + "prompt logprobs") + try: # We start the output_handler on the first call to generate() so # we can call __init__ before the event loop, which enables us diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a194808e513dd..0250a4e19a027 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import dataclasses import gc import itertools import time @@ -58,7 +57,7 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, supports_dynamo) from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, - make_kv_sharing_fast_prefill_attention_metadata, + create_fast_prefill_custom_backend, reorder_batch_to_split_decodes_and_prefills) from 
vllm.v1.cudagraph_dispatcher import CudagraphDispatcher from vllm.v1.kv_cache_interface import (AttentionSpec, @@ -84,9 +83,10 @@ from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorModelRunnerMixin, KVConnectorOutput) from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from .utils import (AttentionGroup, MultiModalBudget, bind_kv_cache, - gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, - sanity_check_mm_encoder_outputs, scatter_mm_placeholders) +from .utils import (AttentionGroup, MultiModalBudget, + add_kv_sharing_layers_to_kv_cache_groups, bind_kv_cache, + gather_mm_placeholders, sanity_check_mm_encoder_outputs, + scatter_mm_placeholders) if TYPE_CHECKING: import xgrammar as xgr @@ -860,6 +860,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): max_seq_len=max_seq_len, block_table_tensor=blk_table_tensor, slot_mapping=slot_mapping, + logits_indices_padded=logits_indices_padded, + num_logits_indices=logits_indices.size(0), causal=True, ) @@ -884,28 +886,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): common_attn_metadata=common_attn_metadata, )) - fast_prefill_metadata = attn_metadata_i - if (self.cache_config.kv_sharing_fast_prefill - and self.kv_sharing_fast_prefill_eligible_layers): - # Dynamically create a a dataclass type that inherits - # from attention metadata type but includes additional - # fields logits_indices_padded and num_logits_indices - # which are required for prefill truncation - fast_prefill_metadata_type = ( - make_kv_sharing_fast_prefill_attention_metadata( - metadata_cls=type(attn_metadata_i), )) - fast_prefill_metadata = fast_prefill_metadata_type( - **dataclasses.asdict(attn_metadata_i), - logits_indices_padded=logits_indices_padded, - num_logits_indices=logits_indices.size(0), - ) - for layer_name in attn_group.layer_names: - if (self.cache_config.kv_sharing_fast_prefill - and layer_name - in 
self.kv_sharing_fast_prefill_eligible_layers): - attn_metadata[layer_name] = fast_prefill_metadata - continue attn_metadata[layer_name] = attn_metadata_i # Hot-Swap lora model @@ -1484,6 +1465,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return self.kv_connector_no_forward(scheduler_output, self.vllm_config) + if self.cache_config.kv_sharing_fast_prefill: + assert not self.input_batch.num_prompt_logprobs, ( + "--kv-sharing-fast-prefill produces incorrect logprobs for " + "prompt tokens, please disable it when the requests " + "need prompt logprobs") + # Prepare the decoder inputs. (attn_metadata, logits_indices, spec_decode_metadata, num_scheduled_tokens_np, spec_decode_common_attn_metadata, @@ -2742,6 +2729,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # layer. for layer_name in layer_names: attn_backend = layers[layer_name].get_attn_backend() + + if layer_name in self.kv_sharing_fast_prefill_eligible_layers: + attn_backend = create_fast_prefill_custom_backend( + "FastPrefill", + attn_backend, + ) + key = attn_backend.full_cls_name() attn_backends[key] = attn_backend attn_backend_layers[key].append(layer_name) @@ -3074,20 +3068,40 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): kv_caches = self._reshape_kv_cache_tensors(kv_cache_config, kv_cache_raw_tensors) - # Setup `kv_cache_config` and `kv_caches` for models - # with cross-layer KV sharing - if self.shared_kv_cache_layers: - initialize_kv_cache_for_kv_sharing( - self.shared_kv_cache_layers, - kv_cache_config.kv_cache_groups, - kv_caches, - self.attn_groups, - self.runner_only_attn_layers, - ) + # Set up cross-layer KV cache sharing + for layer_name, target_layer_name in self.shared_kv_cache_layers.items( + ): + logger.debug("%s reuses KV cache of %s", layer_name, + target_layer_name) + kv_caches[layer_name] = kv_caches[target_layer_name] + + bind_kv_cache(kv_caches, + 
self.compilation_config.static_forward_context, + self.kv_caches) + return kv_caches + + def maybe_add_kv_sharing_layers_to_kv_cache_groups( + self, kv_cache_config: KVCacheConfig) -> None: + """ + Add layers that re-use KV cache to KV cache group of its target layer. + Mapping of KV cache tensors happens in `initialize_kv_cache_tensors()` + """ + if not self.shared_kv_cache_layers: + # No cross-layer KV sharing, return + return + + add_kv_sharing_layers_to_kv_cache_groups( + self.shared_kv_cache_layers, + kv_cache_config.kv_cache_groups, + self.runner_only_attn_layers, + ) + + if self.cache_config.kv_sharing_fast_prefill: + # In You Only Cache Once (https://arxiv.org/abs/2405.05254) or other + # similar KV sharing setups, only the layers that generate KV caches + # are involved in the prefill phase, enabling prefill to early exit. attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention) - # Iterate in reversed order and add layers that re-use KV cache - # e.g. in YOCO-like KV sharing setups (e.g. Gemma3n) for layer_name in reversed(attn_layers): if layer_name in self.shared_kv_cache_layers: self.kv_sharing_fast_prefill_eligible_layers.add( @@ -3095,11 +3109,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): else: break - bind_kv_cache(kv_caches, - self.compilation_config.static_forward_context, - self.kv_caches) - return kv_caches - def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: """ Initialize KV cache based on `kv_cache_config`. 
@@ -3111,6 +3120,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.kv_cache_config = kv_cache_config self.may_reinitialize_input_batch(kv_cache_config) self.may_add_encoder_only_layers_to_kv_cache_config() + self.maybe_add_kv_sharing_layers_to_kv_cache_groups(kv_cache_config) self.initialize_attn_backend(kv_cache_config) kv_caches = self.initialize_kv_cache_tensors(kv_cache_config) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 70ffde39ca333..2307006127085 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -55,9 +55,8 @@ from vllm.v1.worker.kv_connector_model_runner_mixin import ( from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin from vllm.v1.worker.tpu_input_batch import CachedRequestState, InputBatch -from .utils import (MultiModalBudget, bind_kv_cache, - initialize_kv_cache_for_kv_sharing, - sanity_check_mm_encoder_outputs) +from .utils import (MultiModalBudget, add_kv_sharing_layers_to_kv_cache_groups, + bind_kv_cache, sanity_check_mm_encoder_outputs) if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput @@ -1599,6 +1598,30 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.encoder_cache.clear() gc.collect() + def maybe_setup_cross_layer_kv_sharing( + self, + kv_caches: dict[str, torch.Tensor], + kv_cache_config: KVCacheConfig, + ) -> None: + """ + Add layers that re-use KV cache to KV cache group of its target layer. 
+ Mapping of KV cache tensors happens in `initialize_kv_cache_tensors()` + """ + if not self.shared_kv_cache_layers: + # No cross-layer KV sharing, return + return + + add_kv_sharing_layers_to_kv_cache_groups( + self.shared_kv_cache_layers, + kv_cache_config.kv_cache_groups, + ) + + for layer_name, target_layer_name in self.shared_kv_cache_layers.items( + ): + logger.debug("%s reuses KV cache of %s", layer_name, + target_layer_name) + kv_caches[layer_name] = kv_caches[target_layer_name] + def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: """ Initialize KV cache based on `kv_cache_config`. @@ -1664,14 +1687,8 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): else: raise NotImplementedError - # Setup `kv_cache_config` and `kv_caches` for models - # with cross-layer KV sharing - if self.shared_kv_cache_layers: - initialize_kv_cache_for_kv_sharing( - self.shared_kv_cache_layers, - kv_cache_config.kv_cache_groups, - kv_caches, - ) + # Set up cross-layer KV cache sharing if needed + self.maybe_setup_cross_layer_kv_sharing(kv_caches, kv_cache_config) bind_kv_cache( kv_caches, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index a519336e41616..6767804c71b9f 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -203,12 +203,9 @@ def gather_mm_placeholders( return placeholders[is_embed] -def initialize_kv_cache_for_kv_sharing( +def add_kv_sharing_layers_to_kv_cache_groups( shared_kv_cache_layers: dict[str, str], kv_cache_groups: list[KVCacheGroupSpec], - kv_caches: dict[str, torch.Tensor], - # Optional for now to avoid breaking TPU - attn_groups: Optional[list[list[AttentionGroup]]] = None, runner_only_attn_layers: Optional[set[str]] = None, ) -> None: """ @@ -223,38 +220,15 @@ def initialize_kv_cache_for_kv_sharing( means this layer will perform attention using the keys and values from the KV cache of `shared_kv_cache_layers[layer_name]`. kv_cache_groups: The KV cache groups of the model. 
- kv_caches: The allocated kv_caches with layer names as keys. - Note that layers in shared_kv_cache_layers.keys() are not - originally included as it only contains layers which have its own - KV cache allocation. - attn_groups: Optional list of attention groups. Layers in the same KV - cache group may be placed in different attention groups if they - have different attention backends. Currently only provided by - GPU model runner. """ - # mapping from layer name to tuple of (kv_cache_group_idx, attn_group_idx) - layer_to_attn_group_idx: dict[str, tuple[int, int]] = {} - if attn_groups: - for kv_cache_group_idx, kv_attn_groups in enumerate(attn_groups): - for attn_group_idx, attn_group in enumerate(kv_attn_groups): - for layer_name in attn_group.layer_names: - layer_to_attn_group_idx[layer_name] = (kv_cache_group_idx, - attn_group_idx) - else: - for kv_cache_group_idx, kv_cache_group in enumerate(kv_cache_groups): - for layer_name in kv_cache_group.layer_names: - # attn group idx default to 0 if not provided - layer_to_attn_group_idx[layer_name] = (kv_cache_group_idx, 0) + layer_to_kv_cache_group: dict[str, KVCacheGroupSpec] = {} + for kv_cache_group in kv_cache_groups: + for layer_name in kv_cache_group.layer_names: + layer_to_kv_cache_group[layer_name] = kv_cache_group for layer_name, target_layer_name in shared_kv_cache_layers.items(): - kv_caches[layer_name] = kv_caches[target_layer_name] - kv_cache_group_idx = layer_to_attn_group_idx[target_layer_name][0] - kv_cache_groups[kv_cache_group_idx].layer_names.append(layer_name) - - if attn_groups: - attn_group_idx = layer_to_attn_group_idx[target_layer_name][1] - attn_groups[kv_cache_group_idx][attn_group_idx].layer_names.append( - layer_name) + tgt_kv_cache_group = layer_to_kv_cache_group[target_layer_name] + tgt_kv_cache_group.layer_names.append(layer_name) if runner_only_attn_layers is not None: runner_only_attn_layers.add(layer_name) From d3d2aad5a2a06b0ea22ae09cb0c6fb6912fa64d8 Mon Sep 17 00:00:00 2001 From: 
Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 28 Aug 2025 18:18:10 -0400 Subject: [PATCH 08/56] [Log] Use Debug Once for DeepGEMM E8M0 When not Enabled (#23858) --- vllm/utils/deep_gemm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index cd1dbfb813fee..90cdd396209c7 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -36,7 +36,7 @@ def is_deep_gemm_e8m0_used() -> bool: "E8M0 scale on a Hopper or Blackwell-class GPU. """ if not is_deep_gemm_supported(): - logger.info_once( + logger.debug_once( "DeepGEMM E8M0 disabled: DeepGEMM not supported on this system.") return False From b668055a114086b8968d9ff4a53586f1d8ea0b47 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 28 Aug 2025 18:05:52 -0700 Subject: [PATCH 09/56] [V0 Deprecation] Remove V0 Samplers test (#23862) --- tests/samplers/test_sampler.py | 769 ------------------------- tests/samplers/test_seeded_generate.py | 86 --- 2 files changed, 855 deletions(-) delete mode 100644 tests/samplers/test_sampler.py delete mode 100644 tests/samplers/test_seeded_generate.py diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py deleted file mode 100644 index 520b88d03ac8e..0000000000000 --- a/tests/samplers/test_sampler.py +++ /dev/null @@ -1,769 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import itertools -import random -from dataclasses import dataclass -from typing import Optional -from unittest.mock import Mock, patch - -import pytest -import torch -from transformers import GenerationConfig, GenerationMixin - -import vllm.envs as envs -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.utils import set_random_seed -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils 
import Counter, is_pin_memory_available - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This file tests V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -class MockLogitsSampler(Sampler): - - def __init__(self, fake_logits: torch.Tensor): - super().__init__() - self.fake_logits = fake_logits - - def forward(self, *args, **kwargs): - return super().forward(*args, **kwargs) - - -def _prepare_test( - batch_size: int -) -> tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]: - input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) - fake_logits = torch.full((batch_size, VOCAB_SIZE), - 1e-2, - dtype=input_tensor.dtype) - sampler = MockLogitsSampler(fake_logits) - return input_tensor, fake_logits, sampler - - -VOCAB_SIZE = 32000 -RANDOM_SEEDS = list(range(128)) -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - - -def _do_sample( - batch_size: int, - input_tensor: torch.Tensor, - sampler: MockLogitsSampler, - sampling_params: SamplingParams, - device: str, -): - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - seq_lens: list[int] = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params, - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - return sampler(logits=input_tensor, sampling_metadata=sampling_metadata) - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_greedy(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = 
random.randint(1, 256) - input_tensor, fake_logits, sampler = _prepare_test(batch_size) - - sampling_params = SamplingParams(temperature=0) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - expected = torch.argmax(fake_logits, dim=-1) - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == expected[i].item() - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_random(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - for i in range(batch_size): - fake_logits[i, i] = 1e2 - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - ) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == i - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_random_seed(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - for i in range(batch_size): - fake_logits[i, i] = 1e2 - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - seed=random.randint(0, 10000), - ) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == i - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_random_seed_deterministic(seed: int, 
device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - seed=random.randint(0, 10000), - ) - first_sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - second_sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - assert first_sampler_output == second_sampler_output - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_min_tokens_penalty(seed: int, device: str): - seq_id_counter = Counter(start=random.randint(0, 100)) - set_random_seed(seed) - torch.set_default_device(device) - - def create_sampling_params(min_tokens, - eos_token_id=0, - *, - stop_token_ids: Optional[list[int]] = None, - prompt_logprobs: Optional[int] = None): - sampling_params = SamplingParams( - min_tokens=min_tokens, - max_tokens=9999, # keep higher than max of min_tokens - stop_token_ids=stop_token_ids, - # requesting prompt_logprobs changes the structure of `logits` - prompt_logprobs=prompt_logprobs, - ) - sampling_params.all_stop_token_ids.add(eos_token_id) - return sampling_params - - def create_sequence_data(num_input=3, num_generated=0): - seq_data = SequenceData.from_seqs( - random.choices(range(0, VOCAB_SIZE), k=num_input)) - if num_generated > 0: - seq_data.output_token_ids = random.choices(range(0, VOCAB_SIZE), - k=num_generated) - return seq_data - - def generate_test_case(): - # generate multiple seq groups but limit total batch size - batch_size = random.randint(1, 128) - - expected_penalization = [] - sequence_metadata_list: list[SequenceGroupMetadata] = [] - # 20% chance to generate seq group metadata list with all prompts - is_prompt = random.random() < 0.2 - while batch_size > 0: - num_seqs = 1 if is_prompt else random.randint(1, batch_size) - - 
eos_token_id = random.randint(0, VOCAB_SIZE - 1) - min_tokens = random.randint(0, 50) - num_stop_tokens = random.randint(0, 8) - if num_stop_tokens > 0: - stop_token_ids = random.choices(range(0, VOCAB_SIZE - 1), - k=num_stop_tokens) - else: - stop_token_ids = None - - sampling_params = create_sampling_params( - min_tokens=min_tokens, - eos_token_id=eos_token_id, - stop_token_ids=stop_token_ids) - - seq_data: dict[int, SequenceData] = {} - seq_group_penalization: list[bool] = [] - for _ in range(num_seqs): - num_input = random.randint(1, 100) - num_generated = 0 if is_prompt else random.randint(1, 100) - seq_data[next(seq_id_counter)] = create_sequence_data( - num_input=num_input, num_generated=num_generated) - seq_group_penalization.append(num_generated < min_tokens) - - expected_penalization.extend(seq_group_penalization) - sequence_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{batch_size}", - is_prompt=is_prompt, - seq_data=seq_data, - sampling_params=sampling_params, - block_tables={}, - )) - batch_size -= num_seqs - - return { - "expected_penalization": expected_penalization, - "seq_group_metadata_list": sequence_metadata_list, - } - - # define some explicit test cases for edge case behavior - prompt_without_penalization = { - "expected_penalization": [False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(), - }, - sampling_params=create_sampling_params(0), - block_tables={}, - ), - ] - } - - prompt_with_penalization = { - "expected_penalization": [True], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(), - }, - sampling_params=create_sampling_params(1), - block_tables={}, - ), - ] - } - - prompt_with_penalization_and_prompt_logprobs = { - "expected_penalization": [False, False, True], - "seq_group_metadata_list": [ - 
SequenceGroupMetadata( - request_id="test_1", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(num_input=3), - }, - sampling_params=create_sampling_params(1, prompt_logprobs=3), - block_tables={}, - ), - ] - } - - stop_penalizing_after_min_tokens = { - "expected_penalization": [False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=False, - seq_data={ - next(seq_id_counter): - create_sequence_data(num_generated=1), - }, - sampling_params=create_sampling_params(1), - block_tables={}, - ) - ] - } - - stop_token_ids = [42, 99, 42, 0] # intentional duplication - prompt_combination = { - "expected_penalization": [False, True, False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_2", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(num_input=2), - }, - sampling_params=create_sampling_params(1, prompt_logprobs=3), - block_tables={}, - ), - SequenceGroupMetadata( - request_id="test_3", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(), - }, - sampling_params=create_sampling_params( - 0, stop_token_ids=stop_token_ids), - block_tables={}, - ) - ] - } - - stop_token_ids = [1, 999, 37, 37] # intentional duplication - decode_combination = { - "expected_penalization": [True, False, False, True, False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=False, - seq_data={ - next(seq_id_counter): - create_sequence_data(num_generated=1), - next(seq_id_counter): - create_sequence_data(num_generated=100), - }, - sampling_params=create_sampling_params( - 2, stop_token_ids=stop_token_ids), - block_tables={}, - ), - SequenceGroupMetadata( - request_id="test_2", - is_prompt=False, - seq_data={ - next(seq_id_counter): - create_sequence_data(num_generated=20), - next(seq_id_counter): - create_sequence_data(num_generated=1), - next(seq_id_counter): - create_sequence_data(num_generated=10), - }, - 
sampling_params=create_sampling_params( - 10, prompt_logprobs=5, stop_token_ids=stop_token_ids), - block_tables={}, - ), - ] - } - - if seed == 0: - test_cases = [ - prompt_without_penalization, - prompt_with_penalization, - prompt_with_penalization_and_prompt_logprobs, - stop_penalizing_after_min_tokens, - prompt_combination, - decode_combination, - ] - else: - test_cases = [generate_test_case()] - - def run_test_case(*, expected_penalization: list[bool], - seq_group_metadata_list: list[SequenceGroupMetadata]): - assert expected_penalization, \ - "Invalid test case, need expected_penalization" - assert seq_group_metadata_list, \ - "Invalid test case, need seq_group_metadata_list" - - batch_size = 0 - seq_lens: list[int] = [] - sampling_params_per_row: list[SamplingParams] = [] - for sgm in seq_group_metadata_list: - sampling_params = sgm.sampling_params - - num_rows = len(sgm.seq_data) - if sgm.is_prompt: - # a prompt seq_group has only one sequence - seq_data = next(iter(sgm.seq_data.values())) - prompt_len = seq_data.get_prompt_len() - seq_lens.append(prompt_len) - - assert sgm.sampling_params is not None - if sgm.sampling_params.prompt_logprobs: - # with prompt_logprobs each token in the prompt has a row in - # logits - num_rows = prompt_len - - batch_size += num_rows - sampling_params_per_row.extend( - itertools.repeat(sampling_params, num_rows)) - - assert len( - expected_penalization - ) == batch_size, \ - ("Invalid test case, expected_penalization does not match computed" - "batch size") - - _, fake_logits, sampler = _prepare_test(batch_size) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens=seq_lens if seq_lens else None, - query_lens=seq_lens if seq_lens else [1] * batch_size, - device=device, - pin_memory=is_pin_memory_available()) - # the logits tensor is modified in-place by the sampler - _ = sampler(logits=fake_logits, sampling_metadata=sampling_metadata) - - for logits_idx, (should_penalize, sampling_params) in 
enumerate( - zip(expected_penalization, sampling_params_per_row)): - - tokens_to_check = sampling_params.all_stop_token_ids - - if should_penalize: - for token_id in tokens_to_check: - assert fake_logits[logits_idx, token_id] == -float( - 'inf' - ), f"Expected token {token_id} for logits row {logits_idx}" - " to be penalized" - # no other tokens should be set to -inf - assert torch.count_nonzero( - fake_logits[logits_idx, :] == -float('inf')) == len( - tokens_to_check - ), f"Expected only {len(tokens_to_check)} to be penalized" - else: - # no tokens should be set to -inf - assert torch.count_nonzero( - fake_logits[logits_idx, :] == - -float('inf')) == 0, "No tokens should have been penalized" - - for test_case in test_cases: - run_test_case(**test_case) - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_mixed(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - input_tensor, fake_logits, sampler = _prepare_test(batch_size) - - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - expected_tokens: list[Optional[list[int]]] = [] - seq_lens: list[int] = [] - for i in range(batch_size): - expected: Optional[list[int]] = None - sampling_type = random.randint(0, 2) - if sampling_type == 0: - sampling_params = SamplingParams(temperature=0) - expected = [int(torch.argmax(fake_logits[i], dim=-1).item())] - elif sampling_type in (1, 2): - n = random.randint(1, 10) - sampling_params = SamplingParams( - temperature=random.random() + 0.1, - top_p=min(random.random() + 0.1, 1), - top_k=random.randint(0, 10), - n=n, - presence_penalty=random.randint(0, 1), - ) - if sampling_type == 2: - sampling_params.seed = random.randint(0, 10000) - else: - for idx in range(n): - fake_logits[i, i + idx] = 1e2 - expected = list(range(i, i + n)) - - expected_tokens.append(expected) - seq_group_metadata_list.append( - SequenceGroupMetadata( - 
request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params, - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - generators: dict[str, torch.Generator] = {} - - def test_sampling(): - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available(), - generators=generators) - sampler_output = sampler(logits=fake_logits, - sampling_metadata=sampling_metadata) - - for i, (sequence_output, metadata) in enumerate( - zip(sampler_output, seq_group_metadata_list)): - assert metadata.sampling_params is not None - - if (metadata.sampling_params.seed is not None - and expected_tokens[i] is None): - # Record seeded random result to compare with results of - # second invocation - expected_tokens[i] = [ - nth_output.output_token - for nth_output in sequence_output.samples - ] - continue - - expected_tokens_item = expected_tokens[i] - assert expected_tokens_item is not None - - for n, nth_output in enumerate(sequence_output.samples): - assert metadata.sampling_params is not None - - if (metadata.sampling_params.temperature == 0 - or metadata.sampling_params.seed is not None): - # Ensure exact matches for greedy or random with seed - assert nth_output.output_token == expected_tokens_item[n] - else: - # For non-seeded random check that one of the high-logit - # tokens were chosen - assert nth_output.output_token in expected_tokens_item - - # Test batch - test_sampling() - - # Shuffle the batch and resample - target_index = list(range(batch_size)) - for list_to_shuffle in (target_index, seq_group_metadata_list, - expected_tokens, seq_lens): - random.Random(seed).shuffle(list_to_shuffle) - target_index = torch.tensor(target_index) - input_tensor.data = input_tensor.index_select(0, target_index) - fake_logits.data = fake_logits.index_select(0, target_index) - - # This 
time, results of seeded random samples will be compared with - # the corresponding sample in the pre-shuffled batch - test_sampling() - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_top_k_top_p(seed: int, device: str): - set_random_seed(seed) - batch_size = random.randint(1, 256) - top_k = random.randint(100, 500) - top_p = random.random() * 0.1 - vocab_size = 32000 - input_tensor = torch.rand((batch_size, 1024), - device=device, - dtype=torch.float16) - fake_logits = torch.normal(0, - 5, - size=(batch_size, vocab_size), - device=input_tensor.device, - dtype=input_tensor.dtype) - sampler = MockLogitsSampler(fake_logits) - - generation_model = GenerationMixin() - generation_config = GenerationConfig(top_k=top_k, - top_p=top_p, - do_sample=True) - - @dataclass - class MockConfig: - is_encoder_decoder: bool = False - - generation_model.config = MockConfig() # needed by the following method - generation_model._prepare_special_tokens(generation_config, device=device) - processors = generation_model._get_logits_processor(generation_config, - None, - None, - None, [], - device=device) - assert len(processors) == 2 # top_p and top_k - - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - seq_lens: list[int] = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=SamplingParams( - temperature=1, - top_k=top_k, - top_p=top_p, - ), - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - - sample_probs = None - - def mock_sample(probs, *args, **kwargs): - nonlocal sample_probs - sample_probs = probs - return ([[prob.topk(1, 
dim=-1).indices.tolist(), [0]] - for prob in probs], None) - - # top-k and top-p is only calculated when flashinfer kernel is not available - with patch("vllm.model_executor.layers.sampler._sample", mock_sample), \ - patch("vllm.model_executor.layers.sampler." - "flashinfer_top_k_top_p_sampling", None): - sampler(logits=fake_logits, sampling_metadata=sampling_metadata) - - assert sample_probs is not None - - hf_probs = processors(torch.zeros_like(fake_logits), fake_logits.clone()) - hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) - torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5) - assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_flashinfer_fallback(seed: int, device: str): - if not envs.VLLM_USE_FLASHINFER_SAMPLER: - pytest.skip("Flashinfer sampler is disabled") - - pytest.skip("After FlashInfer 0.2.3, sampling will never fail") - - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - def failing_flashinfer_sampling(*_args, **_kwargs): - return None, torch.zeros(batch_size, device=device, dtype=torch.int32) - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - seed=random.randint(0, 10000), - ) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - with patch( - "vllm.model_executor.layers.sampler." 
- "flashinfer_top_k_top_p_sampling", failing_flashinfer_sampling): - fallback_sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - assert sampler_output == fallback_sampler_output - - -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_repetition_penalty_mixed(device: str): - - vocab_size = 8 - - def test_sampling_params(sampling_params: list[SamplingParams]): - - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - seq_lens: list[int] = [] - for i in range(2): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params[i], - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - - fake_logits = torch.full((2, vocab_size), - 1e-2, - device=device, - dtype=torch.float16) - - fake_logits[:, 5] = 1.1e-2 - fake_logits[:, 1] = 1.2e-2 - - sampler = MockLogitsSampler(fake_logits) - - sampler_output = sampler(logits=fake_logits, - sampling_metadata=sampling_metadata) - - generated_tokens = [] - for output in sampler_output: - generated_tokens.append(output.samples[0].output_token) - - return generated_tokens - - # one configuration is greedy with repetition_penalty - sampling_params_rep = SamplingParams( - temperature=0.0, - repetition_penalty=2.0, - ) - - # other configuration is sampling w/o repetition_penalty - sampling_params_sample = SamplingParams( - temperature=1.0, - top_k=1, - seed=42, - ) - - tokens1 = test_sampling_params( - [sampling_params_rep, sampling_params_sample]) - - tokens2 = test_sampling_params( - [sampling_params_sample, sampling_params_rep]) - - assert tokens1[0] == tokens2[1] - assert tokens1[1] == tokens2[0] - - -@pytest.mark.parametrize("device", 
CUDA_DEVICES) -def test_sampler_include_gpu_probs_tensor(device: str): - set_random_seed(42) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - sampler.include_gpu_probs_tensor = True - sampler.should_modify_greedy_probs_inplace = False - - sampling_params = SamplingParams(temperature=0) - - mock_inplace = Mock() - with patch( - "vllm.model_executor.layers.sampler._modify_greedy_probs_inplace", - mock_inplace): - - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - mock_inplace.assert_not_called() - - assert sampler_output.sampled_token_probs is not None - assert sampler_output.logprobs is not None - assert sampler_output.sampled_token_ids is not None diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py deleted file mode 100644 index 5a0efd98acc16..0000000000000 --- a/tests/samplers/test_seeded_generate.py +++ /dev/null @@ -1,86 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Verify that seeded random sampling is deterministic. - -Run `pytest tests/samplers/test_seeded_generate.py`. -""" -import copy -import random -from itertools import combinations - -import pytest - -from vllm import SamplingParams -from vllm.model_executor.utils import set_random_seed - -MODEL = "facebook/opt-125m" -RANDOM_SEEDS = list(range(5)) - - -@pytest.fixture -def vllm_model(vllm_runner, monkeypatch): - # This file relies on V0 internals. 
- monkeypatch.setenv("VLLM_USE_V1", "0") - with vllm_runner(MODEL, dtype="half") as vllm_model: - yield vllm_model - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_random_sample_with_seed( - vllm_model, - example_prompts, - seed: int, -) -> None: - set_random_seed(seed) - - sampling_params = SamplingParams( - # Parameters to ensure sufficient randomness - temperature=3.0, - top_p=min(random.random() + 0.3, 1), - top_k=random.randint(5, 20), - n=random.randint(1, 10), - presence_penalty=random.randint(0, 1), - max_tokens=8, - ignore_eos=True, - ) - - sampling_params_seed_1 = copy.deepcopy(sampling_params) - sampling_params_seed_1.seed = 100 - sampling_params_seed_2 = copy.deepcopy(sampling_params) - sampling_params_seed_2.seed = 200 - - llm = vllm_model.llm - - for prompt in example_prompts: - for params in ( - sampling_params, - sampling_params_seed_1, - sampling_params_seed_2, - sampling_params, - sampling_params_seed_1, - sampling_params_seed_2, - ): - llm._add_request(prompt, params=params) - - results = llm._run_engine(use_tqdm=False) - all_outputs = [[out.token_ids for out in output.outputs] - for output in results] - - for i in range(0, len(example_prompts), 6): - outputs = all_outputs[i:i + 6] - - # verify all non-seeded requests differ - for output_a, output_b in combinations( - (outputs[0], outputs[1], outputs[2], outputs[3]), - 2, - ): - assert output_a != output_b - - # verify requests with the same seed match - assert outputs[1] == outputs[4] - assert outputs[2] == outputs[5] - - # verify generations within the same parallel sampling group differ - for output in outputs: - for sub_output_a, sub_output_b in combinations(output, 2): - assert sub_output_a != sub_output_b From 235c9db8a755e0404628a568bf29a492257fe52e Mon Sep 17 00:00:00 2001 From: Chaojun Zhang Date: Fri, 29 Aug 2025 09:23:04 +0800 Subject: [PATCH 10/56] [XPU] support data parallel for MoE models on XPU (#22887) Signed-off-by: chzhang --- 
.../device_communicators/xpu_communicator.py | 11 +++++++++++ vllm/model_executor/layers/fused_moe/layer.py | 2 ++ 2 files changed, 13 insertions(+) diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py index dee5ed7a28830..067315deb773d 100644 --- a/vllm/distributed/device_communicators/xpu_communicator.py +++ b/vllm/distributed/device_communicators/xpu_communicator.py @@ -7,8 +7,13 @@ import torch import torch.distributed as dist from torch.distributed import ProcessGroup +import vllm.envs as envs +from vllm.logger import init_logger + from .base_device_communicator import DeviceCommunicatorBase +logger = init_logger(__name__) + class XpuCommunicator(DeviceCommunicatorBase): @@ -18,6 +23,12 @@ class XpuCommunicator(DeviceCommunicatorBase): device_group: Optional[ProcessGroup] = None, unique_name: str = ""): super().__init__(cpu_group, device, device_group, unique_name) + if self.use_all2all: + all2all_backend = envs.VLLM_ALL2ALL_BACKEND + if all2all_backend == "naive": + from .all2all import NaiveAll2AllManager + self.all2all_manager = NaiveAll2AllManager(self.cpu_group) + logger.info("Using naive all2all manager.") def all_reduce(self, input_) -> torch.Tensor: dist.all_reduce(input_, group=self.device_group) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 28123d3958adc..5a87763c07211 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -655,6 +655,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): forward_native = forward_tpu elif current_platform.is_cpu(): forward_native = forward_cpu + elif current_platform.is_xpu(): + forward_native = forward_xpu else: forward_native = forward_cuda From de533ab2a14192e461900a4950e2b426d99a6862 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Fri, 29 Aug 2025 02:26:34 +0100 Subject: [PATCH 11/56] [Models] 
Improve iteration over layers (#19497) Signed-off-by: Lukas Geiger --- vllm/model_executor/models/arcee.py | 3 ++- vllm/model_executor/models/arctic.py | 3 ++- vllm/model_executor/models/baichuan.py | 3 ++- vllm/model_executor/models/bailing_moe.py | 4 ++-- vllm/model_executor/models/bamba.py | 3 +-- vllm/model_executor/models/bloom.py | 3 ++- vllm/model_executor/models/chameleon.py | 3 ++- vllm/model_executor/models/chatglm.py | 3 ++- vllm/model_executor/models/commandr.py | 3 ++- vllm/model_executor/models/dbrx.py | 3 ++- vllm/model_executor/models/deepseek.py | 5 +++-- vllm/model_executor/models/deepseek_v2.py | 3 ++- vllm/model_executor/models/dots1.py | 3 ++- vllm/model_executor/models/ernie45_moe.py | 4 ++-- vllm/model_executor/models/ernie45_vl_moe.py | 4 ++-- vllm/model_executor/models/exaone.py | 3 ++- vllm/model_executor/models/exaone4.py | 3 ++- vllm/model_executor/models/falcon.py | 3 ++- vllm/model_executor/models/gemma.py | 3 ++- vllm/model_executor/models/gemma2.py | 3 ++- vllm/model_executor/models/gemma3.py | 3 ++- vllm/model_executor/models/glm4_moe.py | 4 ++-- vllm/model_executor/models/gpt2.py | 3 ++- vllm/model_executor/models/gpt_bigcode.py | 3 ++- vllm/model_executor/models/gpt_j.py | 5 +++-- vllm/model_executor/models/gpt_neox.py | 3 ++- vllm/model_executor/models/granite.py | 3 ++- vllm/model_executor/models/granitemoe.py | 3 ++- vllm/model_executor/models/granitemoehybrid.py | 3 +-- vllm/model_executor/models/granitemoeshared.py | 4 ++-- vllm/model_executor/models/grok1.py | 4 ++-- vllm/model_executor/models/internlm2.py | 3 ++- vllm/model_executor/models/internlm2_ve.py | 3 ++- vllm/model_executor/models/jais.py | 3 ++- vllm/model_executor/models/jamba.py | 3 ++- vllm/model_executor/models/lfm2.py | 5 +++-- vllm/model_executor/models/llama.py | 3 ++- vllm/model_executor/models/mamba2.py | 4 +--- vllm/model_executor/models/mimo.py | 3 ++- vllm/model_executor/models/minicpm.py | 3 ++- vllm/model_executor/models/minimax_text_01.py | 4 ++-- 
vllm/model_executor/models/mixtral.py | 3 ++- vllm/model_executor/models/mixtral_quant.py | 3 ++- vllm/model_executor/models/molmo.py | 3 ++- vllm/model_executor/models/mpt.py | 3 ++- vllm/model_executor/models/nemotron.py | 3 ++- vllm/model_executor/models/nemotron_h.py | 3 +-- vllm/model_executor/models/nemotron_nas.py | 4 ++-- vllm/model_executor/models/olmo.py | 3 ++- vllm/model_executor/models/olmo2.py | 3 ++- vllm/model_executor/models/olmoe.py | 3 ++- vllm/model_executor/models/opt.py | 3 ++- vllm/model_executor/models/orion.py | 3 ++- vllm/model_executor/models/persimmon.py | 3 ++- vllm/model_executor/models/phi.py | 3 ++- vllm/model_executor/models/phimoe.py | 3 ++- vllm/model_executor/models/plamo2.py | 3 ++- vllm/model_executor/models/qwen.py | 3 ++- vllm/model_executor/models/qwen2.py | 3 ++- vllm/model_executor/models/qwen2_moe.py | 3 ++- vllm/model_executor/models/qwen3_moe.py | 4 ++-- vllm/model_executor/models/seed_oss.py | 3 ++- vllm/model_executor/models/stablelm.py | 3 ++- vllm/model_executor/models/starcoder2.py | 3 ++- vllm/model_executor/models/step3_text.py | 4 ++-- 65 files changed, 129 insertions(+), 83 deletions(-) diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index 4cf73e2e0ea56..13ed4da0602ad 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -9,6 +9,7 @@ # activation. 
from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -243,7 +244,7 @@ class ArceeModel(nn.Module): aux_hidden_states: list[torch.Tensor] = [] for idx, layer in enumerate( - self.layers[self.start_layer:self.end_layer]): + islice(self.layers, self.start_layer, self.end_layer)): if idx in self.aux_hidden_state_layers: aux_hidden_states.append( hidden_states + diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 4693c9487a8bf..c566611266af7 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only Snowflake Arctic model.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -403,7 +404,7 @@ class ArcticModel(nn.Module): else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states = layer(positions, hidden_states) if not get_pp_group().is_last_rank: return IntermediateTensors({"hidden_states": hidden_states}) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 804a2f1785d5c..4563c356666ac 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -22,6 +22,7 @@ """Inference-only BaiChuan model compatible with HuggingFace weights.""" import math from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -309,7 +310,7 @@ class BaiChuanModel(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in 
self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 23cab3509ca82..a42640cef9d44 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -24,6 +24,7 @@ # limitations under the License. """Inference-only BailingMoE model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -359,8 +360,7 @@ class BailingMoeModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( hidden_states, position_ids, diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index e2cd31af5390a..a72bbdebe5317 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -345,8 +345,7 @@ class BambaModel(nn.Module): residual = None num_attn = 0 - for i in range(len(self.layers)): - layer = self.layers[i] + for i, layer in enumerate(self.layers): if isinstance(layer, BambaAttentionDecoderLayer): num_attn += 1 diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 126404584892f..13ecda0122be6 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -20,6 +20,7 @@ """Inference-only BLOOM model compatible with HuggingFace weights.""" import math from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -273,7 +274,7 @@ class BloomModel(nn.Module): else: assert intermediate_tensors is not None 
hidden_states = intermediate_tensors["hidden_states"] - for layer in self.h[self.start_layer:self.end_layer]: + for layer in islice(self.h, self.start_layer, self.end_layer): hidden_states = layer(position_ids, hidden_states) if not get_pp_group().is_last_rank: return IntermediateTensors({"hidden_states": hidden_states}) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index e6914ad4c495d..28a1a66c23291 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -3,6 +3,7 @@ from collections.abc import Iterable, Mapping, Sequence from functools import cached_property +from itertools import islice from typing import Annotated, Any, Literal, Optional, Union import torch @@ -914,7 +915,7 @@ class ChameleonModel(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 5470ff3e8b612..1fc2da3e4d7ca 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -5,6 +5,7 @@ """Inference-only ChatGLM model compatible with THUDM weights.""" import json from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -281,7 +282,7 @@ class GLMTransformer(nn.Module): hidden_states: torch.Tensor, position_ids: torch.Tensor, ) -> Union[torch.Tensor, IntermediateTensors]: - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states = layer(hidden_states=hidden_states, position_ids=position_ids) diff --git a/vllm/model_executor/models/commandr.py 
b/vllm/model_executor/models/commandr.py index 4dd84b8f8fdd5..7f87e31abdcd3 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -23,6 +23,7 @@ # This file is based on the LLama model definition file in transformers """PyTorch Cohere model.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -322,7 +323,7 @@ class CohereModel(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index e74d90e0b1d7d..519cd522213b2 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -359,7 +360,7 @@ class DbrxModel(nn.Module): else: assert intermediate_tensors hidden_states = intermediate_tensors["hidden_states"] - for block in self.blocks[self.start_layer:self.end_layer]: + for block in islice(self.blocks, self.start_layer, self.end_layer): hidden_states = block(position_ids, hidden_states) if not get_pp_group().is_last_rank: return IntermediateTensors({"hidden_states": hidden_states}) diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 2f0202f1e038d..e815f13d66dcc 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -24,6 +24,7 @@ # limitations under the License. 
"""Inference-only Deepseek model.""" from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -377,7 +378,7 @@ class DeepseekModel(nn.Module): else: hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: return IntermediateTensors({ @@ -483,4 +484,4 @@ class DeepseekForCausalLM(nn.Module, SupportsPP): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) \ No newline at end of file + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 7657e7cb003d6..ed033954f7c08 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -25,6 +25,7 @@ """Inference-only DeepseekV2/DeepseekV3 model.""" import typing from collections.abc import Callable, Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -712,7 +713,7 @@ class DeepseekV2Model(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index 5f410c0ae5fb0..c386f8db9eec6 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -25,6 +25,7 @@ # limitations under the License. 
"""Inference-only dots1 model.""" from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -391,7 +392,7 @@ class Dots1Model(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index 4780ea931ea50..33ec27fc630e0 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -23,6 +23,7 @@ # limitations under the License. """Inference-only ErineMoE model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -419,8 +420,7 @@ class Ernie4_5_MoeModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index f56c098435154..780974c3b758e 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -23,6 +23,7 @@ # limitations under the License. 
"""Inference-only Erine VL model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -508,8 +509,7 @@ class Ernie4_5_VLMoeModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer(positions, hidden_states, residual, visual_token_mask, **kwargs) diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 8052b6bb82348..942db0143a457 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -26,6 +26,7 @@ """Inference-only Exaone model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -371,7 +372,7 @@ class ExaoneModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.h[self.start_layer:self.end_layer]: + for layer in islice(self.h, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 827e9014184b5..971fcbd2aa275 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -22,6 +22,7 @@ """Inference-only Exaone model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -354,7 +355,7 @@ class Exaone4Model(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer 
in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 62a93dabd5d7f..a9fe0924babd8 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -22,6 +22,7 @@ import math from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -389,7 +390,7 @@ class FalconModel(nn.Module): hidden_states = self.get_input_embeddings(input_ids) else: hidden_states = intermediate_tensors["hidden_states"] - for layer in self.h[self.start_layer:self.end_layer]: + for layer in islice(self.h, self.start_layer, self.end_layer): hidden_states = layer(positions, hidden_states) if not get_pp_group().is_last_rank: return IntermediateTensors({"hidden_states": hidden_states}) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 59c3102add4c7..12eb27503870c 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -18,6 +18,7 @@ """Inference-only Gemma model compatible with HuggingFace weights.""" from collections.abc import Iterable from functools import cache +from itertools import islice from typing import Optional, Union import torch @@ -308,7 +309,7 @@ class GemmaModel(nn.Module): else: hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 8cfe92c64540f..0bdb6c6bf7ae9 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -17,6 +17,7 @@ # See the License for the specific language governing permissions and 
# limitations under the License. from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -292,7 +293,7 @@ class Gemma2Model(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index b762be3c52925..410c715d5241b 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -16,6 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -398,7 +399,7 @@ class Gemma3Model(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index fe5e46a99826f..fcc63815ac56f 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -24,6 +24,7 @@ """Inference-only GLM-4.5 model compatible with HuggingFace weights.""" import typing from collections.abc import Callable, Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -440,8 +441,7 @@ class Glm4MoeModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for i in range(self.start_layer, 
self.end_layer): - layer = self.layers[i] + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 98d76337395b9..4446b5ab181c1 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -20,6 +20,7 @@ # limitations under the License. """Inference-only GPT-2 model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -228,7 +229,7 @@ class GPT2Model(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] - for layer in self.h[self.start_layer:self.end_layer]: + for layer in islice(self.h, self.start_layer, self.end_layer): hidden_states = layer(hidden_states) if not get_pp_group().is_last_rank: diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 036ded530f97d..d5c2604145eed 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -21,6 +21,7 @@ # limitations under the License. 
"""Inference-only GPTBigCode model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -246,7 +247,7 @@ class GPTBigCodeModel(nn.Module): else: hidden_states = intermediate_tensors["hidden_states"] - for layer in self.h[self.start_layer:self.end_layer]: + for layer in islice(self.h, self.start_layer, self.end_layer): hidden_states = layer(hidden_states) if not get_pp_group().is_last_rank: diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index bd162a5e57bc1..584c7f5d8a2d1 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -19,6 +19,7 @@ # limitations under the License. """Inference-only GPT-J model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -223,7 +224,7 @@ class GPTJModel(nn.Module): hidden_states = self.get_input_embeddings(input_ids) else: hidden_states = intermediate_tensors["hidden_states"] - for layer in self.h[self.start_layer:self.end_layer]: + for layer in islice(self.h, self.start_layer, self.end_layer): hidden_states = layer(position_ids, hidden_states) if not get_pp_group().is_last_rank: return IntermediateTensors({"hidden_states": hidden_states}) @@ -336,4 +337,4 @@ class GPTJForCausalLM(nn.Module, SupportsPP): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) \ No newline at end of file + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index d418d8bb86cee..e97db188e27eb 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -19,6 +19,7 @@ # limitations under the License. 
"""Inference-only GPT-NeoX model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -235,7 +236,7 @@ class GPTNeoXModel(nn.Module): hidden_states = self.get_input_embeddings(input_ids) else: hidden_states = intermediate_tensors["hidden_states"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states = layer(position_ids, hidden_states) if not get_pp_group().is_last_rank: return IntermediateTensors({"hidden_states": hidden_states}) diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 507a9206c4281..f8ba0229210a9 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -24,6 +24,7 @@ # limitations under the License. """Inference-only IBM Granite model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -316,7 +317,7 @@ class GraniteModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states = layer(positions, hidden_states) if not get_pp_group().is_last_rank: diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 7d31854dce8d8..07ad75bcf1665 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -24,6 +24,7 @@ # limitations under the License. 
"""Inference-only GraniteMoe model.""" from collections.abc import Iterable +from itertools import islice from typing import Any, Optional import torch @@ -303,7 +304,7 @@ class GraniteMoeModel(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states = layer(positions, hidden_states) if not get_pp_group().is_last_rank: return IntermediateTensors({ diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index f451e65338b78..79c6d8146ba9c 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -397,8 +397,7 @@ class GraniteMoeHybridModel(nn.Module): residual = intermediate_tensors["residual"] num_attn = 0 - for i in range(len(self.layers)): - layer = self.layers[i] + for i, layer in enumerate(self.layers): if isinstance(layer, GraniteMoeHybridAttentionDecoderLayer): num_attn += 1 diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index 1e2e8544179c7..0b568a4b22685 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -6,6 +6,7 @@ The architecture is the same as granitemoe but with the addition of shared experts. 
""" from collections.abc import Iterable +from itertools import islice from typing import Optional import torch @@ -200,8 +201,7 @@ class GraniteMoeSharedModel(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states = layer(positions, hidden_states) if not get_pp_group().is_last_rank: return IntermediateTensors({ diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 3659249cd8bd6..a591134383371 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -23,6 +23,7 @@ # limitations under the License. """Inference-only Grok1 model.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -347,8 +348,7 @@ class Grok1Model(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 26bc48ffbd9bc..320e8d9d480c3 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -3,6 +3,7 @@ from collections.abc import Iterable from functools import partial +from itertools import islice from typing import Any, Optional, Union import torch @@ -297,7 +298,7 @@ class InternLM2Model(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in 
self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: return IntermediateTensors({ diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 4bbb49da0e96f..d41ac2b70bc69 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from itertools import islice from typing import Optional, Union import torch @@ -123,7 +124,7 @@ class InternLM2VEModel(InternLM2Model): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index bed4a5dff2efa..91a06dd502474 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -23,6 +23,7 @@ import math from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -276,7 +277,7 @@ class JAISModel(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] - for layer in self.h[self.start_layer:self.end_layer]: + for layer in islice(self.h, self.start_layer, self.end_layer): hidden_states = layer(hidden_states) if not get_pp_group().is_last_rank: diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 3c1a0b68df56e..aebd2cbe2e999 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -2,6 +2,7 @@ # 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only Jamba model.""" from collections.abc import Iterable +from itertools import islice from typing import Optional import torch @@ -350,7 +351,7 @@ class JambaModel(nn.Module): kv_cache_index = 0 mamba_cache_index = 0 - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): layer_mamba_cache_params = None if isinstance(layer, JambaAttentionDecoderLayer): kv_cache_index += 1 diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index 5f3148b47eadc..927f78c4e4b45 100644 --- a/vllm/model_executor/models/lfm2.py +++ b/vllm/model_executor/models/lfm2.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable +from itertools import islice from typing import Any, Optional import torch @@ -374,7 +375,7 @@ class Lfm2Model(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions=positions, hidden_states=hidden_states, @@ -554,4 +555,4 @@ class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights) \ No newline at end of file + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e39a6df843cd4..a22bde194f5de 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -24,6 +24,7 @@ # limitations under the License. 
"""Inference-only LLaMA model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -383,7 +384,7 @@ class LlamaModel(nn.Module): aux_hidden_states = [] for idx, layer in enumerate( - self.layers[self.start_layer:self.end_layer]): + islice(self.layers, self.start_layer, self.end_layer)): if idx in self.aux_hidden_state_layers: aux_hidden_states.append(hidden_states + residual) hidden_states, residual = layer(positions, hidden_states, residual) diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index 3432cf29feac6..81b9a125380aa 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -164,9 +164,7 @@ class Mamba2Model(nn.Module): # v1 get mamba2_metadata from forward_context mamba2_metadata = None - for i in range(len(self.layers)): - layer = self.layers[i] - + for i, layer in enumerate(self.layers): hidden_states, residual = layer( positions=positions, hidden_states=hidden_states, diff --git a/vllm/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py index 5b497dd9d89f5..ea5292d0df202 100644 --- a/vllm/model_executor/models/mimo.py +++ b/vllm/model_executor/models/mimo.py @@ -26,6 +26,7 @@ # limitations under the License. 
"""Inference-only MiMo model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -74,7 +75,7 @@ class MiMoModel(Qwen2Model): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index d398a5d12bbcd..5632f8c8cc4fb 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -25,6 +25,7 @@ """Inference-only MiniCPM model compatible with HuggingFace weights.""" import math from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -414,7 +415,7 @@ class MiniCPMModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 176a40179bcac..93ef13d5d16a0 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -3,6 +3,7 @@ """Inference-only MiniMaxText01 model.""" import math from collections.abc import Iterable +from itertools import islice from typing import TYPE_CHECKING, Optional, Union if TYPE_CHECKING: @@ -1019,8 +1020,7 @@ class MiniMaxText01Model(nn.Module): minimax_cache_index = 0 - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] + for layer in islice(self.layers, 
self.start_layer, self.end_layer): _caches = None if not envs.VLLM_USE_V1 and isinstance( layer.self_attn, MiniMaxText01LinearAttention): diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 30de83da49e0e..52fcbbfc58be6 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -24,6 +24,7 @@ # limitations under the License. """Inference-only Mixtral model.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -307,7 +308,7 @@ class MixtralModel(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: return IntermediateTensors({ diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index c8ad358c622d2..692267b4d7271 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -24,6 +24,7 @@ # limitations under the License. 
"""Inference-only Mixtral model.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import numpy as np @@ -346,7 +347,7 @@ class MixtralModel(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: return IntermediateTensors({ diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 5fc28ed0e493e..b2fc7be1af224 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -5,6 +5,7 @@ import math from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass from functools import cached_property, partial +from itertools import islice from typing import Annotated, Optional, Union import numpy as np @@ -842,7 +843,7 @@ class MolmoModel(nn.Module, SupportsQuant): residual = intermediate_tensors["residual"] # Apply blocks one-by-one. 
- for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 8db52a69924c9..48ac91fa6dde0 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -4,6 +4,7 @@ # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -260,7 +261,7 @@ class MPTModel(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] - for block in self.blocks[self.start_layer:self.end_layer]: + for block in islice(self.blocks, self.start_layer, self.end_layer): hidden_states = block(position_ids, hidden_states) if not get_pp_group().is_last_rank: return IntermediateTensors({"hidden_states": hidden_states}) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index eabf47b1aede4..10adc62d3de38 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -24,6 +24,7 @@ # limitations under the License. 
"""Inference-only Nemotron model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -353,7 +354,7 @@ class NemotronModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 07cd5a4c6e24f..8a563288cb4d6 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -399,8 +399,7 @@ class NemotronHModel(nn.Module): residual = None num_non_mamba_layers = 0 - for i in range(len(self.layers)): - layer = self.layers[i] + for i, layer in enumerate(self.layers): layer_mamba_cache_params = None if isinstance(layer, NemotronHMambaDecoderLayer) and mamba_cache_params: diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index a766ed9476a65..f8e38dcd80b5a 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -24,6 +24,7 @@ # limitations under the License. 
"""Inference-only deci model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -287,8 +288,7 @@ class DeciModel(nn.Module): residual = intermediate_tensors["residual"] kv_cache_index = 0 - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] + for layer in islice(self.layers, self.start_layer, self.end_layer): if not layer._is_no_op_attention: hidden_states, residual = layer(positions, hidden_states, residual) diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 01639d398126f..71575989565a8 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -24,6 +24,7 @@ # limitations under the License. """Inference-only OLMo model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -280,7 +281,7 @@ class OlmoModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] # Apply blocks one-by-one. - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): # shape: (batch_size, seq_len, d_model) hidden_states = layer(positions, hidden_states) diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 66a0f9115585a..bccd1b87043a5 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -26,6 +26,7 @@ from collections.abc import Iterable from functools import partial +from itertools import islice from typing import Optional, Union import torch @@ -305,7 +306,7 @@ class Olmo2Model(nn.Module): assert isinstance(hidden_states, torch.Tensor) # Apply blocks one-by-one. 
- for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): # shape: (batch_size, seq_len, d_model) hidden_states = layer(positions, hidden_states) diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index a47c3bd416459..9b8525bfadece 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -15,6 +15,7 @@ """Inference-only OLMoE model compatible with HuggingFace weights.""" from collections.abc import Iterable from functools import partial +from itertools import islice from typing import Any, Optional, Union import torch @@ -314,7 +315,7 @@ class OlmoeModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 9eaac1e28dcd8..b92e586f0bf21 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -20,6 +20,7 @@ # limitations under the License. 
"""Inference-only OPT model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -269,7 +270,7 @@ class OPTDecoder(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states = layer(hidden_states) if not get_pp_group().is_last_rank: diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index d121188ba5d4a..add751ebf09cc 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -7,6 +7,7 @@ # LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE """Inference-only Orion-14B model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -252,7 +253,7 @@ class OrionModel(nn.Module): else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states = layer(positions, hidden_states) if not get_pp_group().is_last_rank: return IntermediateTensors({ diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index f8db99eb92ba8..6bdd38d068800 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -23,6 +23,7 @@ # limitations under the License. 
"""Inference-only persimmon model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -255,7 +256,7 @@ class PersimmonModel(nn.Module): else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states = layer(positions, hidden_states) if not get_pp_group().is_last_rank: return IntermediateTensors({"hidden_states": hidden_states}) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 21d517b3a490f..789b24eb0f6be 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -38,6 +38,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """Inference-only Phi-1.5 model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -240,7 +241,7 @@ class PhiModel(nn.Module): else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states = layer(positions, hidden_states) if not get_pp_group().is_last_rank: diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index cfe0982204fa9..15ae081a9f5fc 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -24,6 +24,7 @@ # limitations under the License. 
"""Inference-only PhiMoE model.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -500,7 +501,7 @@ class PhiMoEModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index e5034b536266a..7f70e44b10a6d 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only PLaMo2 model.""" from collections.abc import Iterable +from itertools import islice from typing import Optional import torch @@ -614,7 +615,7 @@ class Plamo2Decoder(torch.nn.Module): mamba2_metadata: Mamba2Metadata, ) -> torch.Tensor: mamba_cache_index = 0 - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): layer_mamba_cache_params = None if layer.is_mamba: layer_mamba_cache_params = mamba_cache_params.at_layer_idx( diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index e804f03e014e1..e32dc51f00c09 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -8,6 +8,7 @@ """Inference-only QWen model compatible with HuggingFace weights.""" import json from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -234,7 +235,7 @@ class QWenModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.h[self.start_layer:self.end_layer]: + for layer in islice(self.h, self.start_layer, self.end_layer): 
hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 27c1e68c6704b..54dc0bebd9c5e 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -25,6 +25,7 @@ # limitations under the License. """Inference-only Qwen2 model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -358,7 +359,7 @@ class Qwen2Model(nn.Module): aux_hidden_states = [] for idx, layer in enumerate( - self.layers[self.start_layer:self.end_layer]): + islice(self.layers, self.start_layer, self.end_layer)): if idx in self.aux_hidden_state_layers: aux_hidden_states.append(hidden_states + residual) hidden_states, residual = layer(positions, hidden_states, residual) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 5c4ad34246d66..5551ad8c32329 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -25,6 +25,7 @@ # limitations under the License. 
"""Inference-only Qwen2MoE model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -381,7 +382,7 @@ class Qwen2MoeModel(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: return IntermediateTensors({ diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 8498f61b35fdd..94e6a66bea5cb 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -24,6 +24,7 @@ """Inference-only Qwen3MoE model compatible with HuggingFace weights.""" import typing from collections.abc import Callable, Iterable +from itertools import islice from typing import Any, Optional, Union import torch @@ -420,8 +421,7 @@ class Qwen3MoeModel(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: return IntermediateTensors({ diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index 34a87a6a69a39..e3c7c700f8fa1 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -23,6 +23,7 @@ # limitations under the License. 
"""Inference-only SeedOss model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -340,7 +341,7 @@ class SeedOssModel(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions, hidden_states, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index d6ec743ce845e..9e880ebd50813 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -22,6 +22,7 @@ """Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -247,7 +248,7 @@ class StableLMEpochModel(nn.Module): else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer(positions, hidden_states) if not get_pp_group().is_last_rank: return IntermediateTensors({"hidden_states": hidden_states}) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 9d9a2bff0e43f..62ff9b6182755 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -21,6 +21,7 @@ # limitations under the License. 
""" PyTorch Starcoder2 model.""" from collections.abc import Iterable +from itertools import islice from typing import Optional, Union import torch @@ -250,7 +251,7 @@ class Starcoder2Model(nn.Module): else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] - for layer in self.layers[self.start_layer:self.end_layer]: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states = layer(positions, hidden_states) if not get_pp_group().is_last_rank: return IntermediateTensors({"hidden_states": hidden_states}) diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 47d2af5c2a140..97611d3e140ec 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only Jurassic model.""" from collections.abc import Iterable +from itertools import islice from typing import Any, Optional import torch @@ -346,8 +347,7 @@ class Step3TextModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: From 006477e60b49babfca96352c7c648f10fff4a053 Mon Sep 17 00:00:00 2001 From: Charlie Fu Date: Thu, 28 Aug 2025 21:52:27 -0500 Subject: [PATCH 12/56] [ROCm][Fix] Fix rocm build caused by #23791 (#23847) Signed-off-by: charlifu --- csrc/cache_kernels.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index fc82a1fa8ed78..fbb022464ef27 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -913,7 +913,6 @@ __global__ void cp_gather_cache( const int32_t split_end = min((split + 1) * 
split_slots, tot_slots); const bool is_active_split = (split_start < tot_slots); - const bool is_last_split = (split_end == tot_slots); if (!is_active_split) return; From c8b3b299c9f3142546e0a41f835e561af1aaffb7 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 29 Aug 2025 00:25:33 -0400 Subject: [PATCH 13/56] [tests] Improve speed and reliability of test_transcription_api_correctness (#23854) Signed-off-by: Russell Bryant --- .../correctness/test_transcription_api_correctness.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py index 58195f98bd351..0d0ce0be8c5f8 100644 --- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py +++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py @@ -49,8 +49,7 @@ async def transcribe_audio(client, tokenizer, y, sr): return latency, num_output_tokens, transcription.text -async def bound_transcribe(model_name, sem, client, audio, reference): - tokenizer = AutoTokenizer.from_pretrained(model_name) +async def bound_transcribe(sem, client, tokenizer, audio, reference): # Use semaphore to limit concurrent requests. async with sem: result = await transcribe_audio(client, tokenizer, *audio) @@ -63,15 +62,19 @@ async def bound_transcribe(model_name, sem, client, audio, reference): async def process_dataset(model, client, data, concurrent_request): sem = asyncio.Semaphore(concurrent_request) + # Load tokenizer once outside the loop + tokenizer = AutoTokenizer.from_pretrained(model) + # Warmup call as the first `librosa.load` server-side is quite slow. 
audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"] - _ = await bound_transcribe(model, sem, client, (audio, sr), "") + _ = await bound_transcribe(sem, client, tokenizer, (audio, sr), "") tasks: list[asyncio.Task] = [] for sample in data: audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"] task = asyncio.create_task( - bound_transcribe(model, sem, client, (audio, sr), sample["text"])) + bound_transcribe(sem, client, tokenizer, (audio, sr), + sample["text"])) tasks.append(task) return await asyncio.gather(*tasks) From 98ac0cb32d9462e50bd998f9f2eb6e4c09232c95 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 29 Aug 2025 12:41:20 +0800 Subject: [PATCH 14/56] [Bugfix] Use `ReplicatedLinear` for SequenceClassification head (#23836) Signed-off-by: Isotr0py --- tests/models/language/pooling/test_qwen3_reranker.py | 7 ++----- vllm/model_executor/models/adapters.py | 5 ++--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/models/language/pooling/test_qwen3_reranker.py b/tests/models/language/pooling/test_qwen3_reranker.py index 8c6537f3193f8..5dd2d9eae9115 100644 --- a/tests/models/language/pooling/test_qwen3_reranker.py +++ b/tests/models/language/pooling/test_qwen3_reranker.py @@ -96,8 +96,5 @@ def test_rerank_models_mteb_tp(vllm_runner, "tensor_parallel_size": 2, } - mteb_test_rerank_models(Qwen3RerankerHfRunner, - vllm_runner, - model_info, - vllm_extra_kwargs, - atol=1.2e-2) + mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info, + vllm_extra_kwargs) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 49e9a2d65ea11..50c2cd97f3d09 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -248,7 +248,7 @@ def as_seq_cls_model(cls: _T) -> _T: return cls # Lazy import - from vllm.model_executor.layers.linear import RowParallelLinear + from vllm.model_executor.layers.linear import ReplicatedLinear from 
vllm.model_executor.layers.pooler import (ClassifierPooler, DispatchPooler, Pooler, PoolingMethod, PoolingType) @@ -264,10 +264,9 @@ def as_seq_cls_model(cls: _T) -> _T: config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - self.score = RowParallelLinear( + self.score = ReplicatedLinear( config.hidden_size, config.num_labels, - input_is_parallel=False, bias=False, params_dtype=torch.float32, quant_config=quant_config, From 5264015d74f2e0213a1e7d51041a558d7ea580e8 Mon Sep 17 00:00:00 2001 From: Jinghui Zhang Date: Thu, 28 Aug 2025 22:54:12 -0700 Subject: [PATCH 15/56] [BugFix][AMD][Deepseek] fix a dtype mismatch error for deepseek running on AMD (#23864) Signed-off-by: Jinghui Zhang --- .../layers/fused_moe/rocm_aiter_fused_moe.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 93e20c3477bbe..b838fd798bbc0 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -279,7 +279,7 @@ def rocm_aiter_grouped_topk( if e_score_correction_bias is not None: torch.ops.vllm.rocm_aiter_biased_grouped_topk( gating_output, - e_score_correction_bias, + e_score_correction_bias.to(gating_output.dtype), topk_weights, topk_ids, num_expert_group, @@ -409,15 +409,15 @@ def shuffle_weights( *tensors: torch.Tensor, layout: tuple[int, int] = (16, 16) ) -> tuple[torch.Tensor, ...]: """ - Applies shuffle_weight function from AITER to each + Applies shuffle_weight function from AITER to each input tensor and returns them. - + Rearranges (shuffles) the input tensor/s into a specified block layout for optimized computation. Args: *tensors: Variable number of torch.Tensor objects. - layout: A pair of integers specifying the + layout: A pair of integers specifying the block sizes used to divide the tensors during shuffling. 
Default is (16, 16). From 6597d7a4566d344835f5a90621397d8fee490b10 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Fri, 29 Aug 2025 13:54:16 +0800 Subject: [PATCH 16/56] [Platform] import activation_quant_fusion for CUDA only (#23882) Signed-off-by: wangxiyuan --- vllm/compilation/pass_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index e07e52be9fdf6..1b1cbe4fa12c2 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -8,13 +8,13 @@ from vllm.logger import init_logger from vllm.platforms import current_platform if current_platform.is_cuda_alike(): + from .activation_quant_fusion import ActivationQuantFusionPass from .fusion import FusionPass from .fusion_attn import AttnFusionPass if current_platform.is_cuda(): from .collective_fusion import AllReduceFusionPass, AsyncTPPass -from .activation_quant_fusion import ActivationQuantFusionPass from .fix_functionalization import FixFunctionalizationPass from .inductor_pass import CustomGraphPass, InductorPass, get_pass_context from .noop_elimination import NoOpEliminationPass From 05d839c19e9582d62c860686678bac68240d7254 Mon Sep 17 00:00:00 2001 From: Raghavan Date: Fri, 29 Aug 2025 11:25:06 +0530 Subject: [PATCH 17/56] Fix(async): Add support for truncate_prompt_tokens in AsyncLLM (#23800) --- vllm/v1/engine/async_llm.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7440fe1f07e91..2a9fa1fd9172c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -15,6 +15,7 @@ import vllm.envs as envs from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient +from vllm.entrypoints.utils import _validate_truncation_size from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE from vllm.inputs import PromptType from 
vllm.inputs.preprocess import InputPreprocessor @@ -348,6 +349,15 @@ class AsyncLLM(EngineClient): # to handle startup failure gracefully in the OpenAI server. self._run_output_handler() + tokenization_kwargs: dict[str, Any] = {} + truncate_prompt_tokens = sampling_params.truncate_prompt_tokens + + _validate_truncation_size( + self.model_config.max_model_len, + truncate_prompt_tokens, + tokenization_kwargs, + ) + q = await self.add_request( request_id, prompt, @@ -355,6 +365,7 @@ class AsyncLLM(EngineClient): lora_request=lora_request, trace_headers=trace_headers, priority=priority, + tokenization_kwargs=tokenization_kwargs, data_parallel_rank=data_parallel_rank, ) @@ -481,6 +492,7 @@ class AsyncLLM(EngineClient): lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, + truncate_prompt_tokens: Optional[int] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> AsyncGenerator[PoolingRequestOutput, None]: """ @@ -503,6 +515,14 @@ class AsyncLLM(EngineClient): # to handle startup failure gracefully in the OpenAI server. 
self._run_output_handler() + if tokenization_kwargs is None: + tokenization_kwargs = dict[str, Any]() + _validate_truncation_size( + self.model_config.max_model_len, + truncate_prompt_tokens, + tokenization_kwargs, + ) + q = await self.add_request( request_id, prompt, From b4f9e9631c84c73cbf05f18402074be1abf0471d Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 29 Aug 2025 14:28:35 +0800 Subject: [PATCH 18/56] [CI/Build] Clean up LoRA test (#23890) Signed-off-by: Jee Jee Li --- .../scripts/hardware_ci/run-amd-test.sh | 1 - .buildkite/test-pipeline.yaml | 9 +-- .../llm/test_generate_multiple_loras.py | 80 ------------------- ...ith_tp.py => test_llm_with_multi_loras.py} | 37 ++++++++- 4 files changed, 40 insertions(+), 87 deletions(-) delete mode 100644 tests/entrypoints/llm/test_generate_multiple_loras.py rename tests/lora/{test_multi_loras_with_tp.py => test_llm_with_multi_loras.py} (80%) diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index df0bae0c9cbff..c395011a24485 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -164,7 +164,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then --ignore=entrypoints/llm/test_chat.py \ --ignore=entrypoints/llm/test_accuracy.py \ --ignore=entrypoints/llm/test_init.py \ - --ignore=entrypoints/llm/test_generate_multiple_loras.py \ --ignore=entrypoints/llm/test_prompt_validation.py "} fi diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 454aaca0a1121..f2652045526b2 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -109,10 +109,9 @@ steps: - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py 
--ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - label: Entrypoints Test (API Server) # 40min @@ -326,7 +325,7 @@ steps: source_file_dependencies: - vllm/lora - tests/lora - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py parallelism: 4 - label: PyTorch Compilation Unit Tests @@ -807,13 +806,13 @@ steps: # requires multi-GPU testing for validation. 
- pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_multi_loras_with_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py - label: Weight Loading Multiple GPU Test # 33min mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" - num_gpus: 2 + num_gpus: 2 optional: true source_file_dependencies: - vllm/ diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py deleted file mode 100644 index a04f195692e9b..0000000000000 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ /dev/null @@ -1,80 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import weakref - -import pytest -# downloading lora to test lora requests -from huggingface_hub import snapshot_download - -from vllm import LLM -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.lora.request import LoRARequest - -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" - -PROMPTS = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] - -LORA_NAME = "typeof/zephyr-7b-beta-lora" - - -@pytest.fixture(scope="module") -def monkeypatch_module(): - from _pytest.monkeypatch import MonkeyPatch - mpatch = MonkeyPatch() - yield mpatch - mpatch.undo() - - -@pytest.fixture(scope="module", params=[False, True]) -def llm(request, monkeypatch_module): - - use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') - - # pytest caches the fixture so we use weakref.proxy to - # enable garbage collection - llm = LLM(model=MODEL_NAME, - tensor_parallel_size=1, - max_model_len=8192, - enable_lora=True, - max_loras=4, - max_lora_rank=64, - max_num_seqs=128, - enforce_eager=True) - - yield weakref.proxy(llm) - - del llm - - cleanup_dist_env_and_memory() - - -@pytest.fixture(scope="module") -def 
zephyr_lora_files(): - return snapshot_download(repo_id=LORA_NAME) - - -@pytest.mark.skip_global_cleanup -def test_multiple_lora_requests(llm: LLM, zephyr_lora_files): - lora_request = [ - LoRARequest(LORA_NAME + str(idx), idx + 1, zephyr_lora_files) - for idx in range(len(PROMPTS)) - ] - # Multiple SamplingParams should be matched with each prompt - outputs = llm.generate(PROMPTS, lora_request=lora_request) - assert len(PROMPTS) == len(outputs) - - # Exception raised, if the size of params does not match the size of prompts - with pytest.raises(ValueError): - outputs = llm.generate(PROMPTS, lora_request=lora_request[:1]) - - # Single LoRARequest should be applied to every prompt - single_lora_request = lora_request[0] - outputs = llm.generate(PROMPTS, lora_request=single_lora_request) - assert len(PROMPTS) == len(outputs) diff --git a/tests/lora/test_multi_loras_with_tp.py b/tests/lora/test_llm_with_multi_loras.py similarity index 80% rename from tests/lora/test_multi_loras_with_tp.py rename to tests/lora/test_llm_with_multi_loras.py index fe9bd3f269515..3d8dd512a2019 100644 --- a/tests/lora/test_multi_loras_with_tp.py +++ b/tests/lora/test_llm_with_multi_loras.py @@ -1,8 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ -Script to test multi loras service with tp >= 2 +This script contains: +1. test multi loras service with tp >= 2 +2. 
test multi loras request """ +import pytest + from tests.utils import multi_gpu_test from vllm import LLM, SamplingParams from vllm.lora.request import LoRARequest @@ -156,3 +160,34 @@ def test_multi_loras_with_tp_sync(): output_text = call_llm_get_outputs(prompt, "Alice") check_outputs(output_text, expected_output) + + +def test_multiple_lora_requests(): + llm = LLM( + model=MODEL_PATH, + enable_lora=True, + max_loras=4, + max_lora_rank=LORA_RANK, + max_model_len=512, + gpu_memory_utilization=0.5, + enforce_eager=True, + ) + PROMPTS = ["Hello, my name is"] * 2 + LORA_NAME = "Alice" + lora_request = [ + LoRARequest(LORA_NAME + str(idx), idx + 1, + LORA_NAME_PATH_MAP[LORA_NAME]) + for idx in range(len(PROMPTS)) + ] + # Multiple SamplingParams should be matched with each prompt + outputs = llm.generate(PROMPTS, lora_request=lora_request) + assert len(PROMPTS) == len(outputs) + + # Exception raised, if the size of params does not match the size of prompts + with pytest.raises(ValueError): + outputs = llm.generate(PROMPTS, lora_request=lora_request[:1]) + + # Single LoRARequest should be applied to every prompt + single_lora_request = lora_request[0] + outputs = llm.generate(PROMPTS, lora_request=single_lora_request) + assert len(PROMPTS) == len(outputs) From 2d0afcc9dc925928ee8764c826a3661e487f9f82 Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Thu, 28 Aug 2025 23:29:13 -0700 Subject: [PATCH 19/56] [mrope][Qwen2-VL] Fix edge case where getting index of image/video token can potentially throw in default vl mrope implementation. 
(#23895) Signed-off-by: Chenheli Hua --- .../layers/rotary_embedding/mrope.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index e374aa9bebf9e..5686ec7b35de8 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -670,12 +670,18 @@ class MRotaryEmbedding(RotaryEmbedding): image_index, video_index = 0, 0 for _ in range(image_nums + video_nums): video_second_per_grid_t = 0.0 - if image_token_id in input_tokens and remain_images > 0: - ed_image = input_tokens.index(image_token_id, st) + if remain_images > 0: + try: + ed_image = input_tokens.index(image_token_id, st) + except ValueError: + ed_image = len(input_tokens) + 1 else: ed_image = len(input_tokens) + 1 - if video_token_id in input_tokens and remain_videos > 0: - ed_video = input_tokens.index(video_token_id, st) + if remain_videos > 0: + try: + ed_video = input_tokens.index(video_token_id, st) + except ValueError: + ed_video = len(input_tokens) + 1 else: ed_video = len(input_tokens) + 1 if ed_image < ed_video: From 885ca6d31db8816ee08e3fa634fbb58add289898 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Fri, 29 Aug 2025 14:58:48 +0800 Subject: [PATCH 20/56] [Misc] Fix warnings for mistral model (#23552) Signed-off-by: zjy0516 Signed-off-by: Jiangyun Zhu Co-authored-by: Patrick von Platen --- vllm/model_executor/models/pixtral.py | 12 ++++---- vllm/model_executor/models/voxtral.py | 12 ++++---- vllm/transformers_utils/tokenizers/mistral.py | 30 +++++++++++-------- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index a74e01a59697e..e7f5799a80067 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -15,7 +15,7 @@ from mistral_common.protocol.instruct.messages 
import (ImageChunk, TextChunk, from mistral_common.protocol.instruct.request import ChatCompletionRequest from mistral_common.tokens.tokenizers.multimodal import ImageEncoder from PIL import Image -from transformers import PixtralVisionConfig, TensorType +from transformers import BatchFeature, PixtralVisionConfig, TensorType from transformers.image_utils import ImageInput from transformers.models.pixtral.image_processing_pixtral import ( _num_image_tokens as _get_pixtral_hf_num_image_tokens) @@ -163,10 +163,12 @@ class PixtralProcessorAdapter: images_processed.append(image_processed) images_tokens.append(image_tokens) - return { - "input_ids": torch.cat(images_tokens)[None].expand(len(text), -1), - "images": images_processed, - } + return BatchFeature({ + "input_ids": + torch.cat(images_tokens)[None].expand(len(text), -1), + "images": + images_processed, + }) class PixtralProcessingInfo(BaseProcessingInfo): diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index eed8d89ca4f5a..6bc748407a7d1 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -17,7 +17,7 @@ from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio, from mistral_common.protocol.instruct.request import ChatCompletionRequest from mistral_common.protocol.transcription.request import TranscriptionRequest from mistral_common.tokens.tokenizers.audio import Audio, AudioEncoder -from transformers import TensorType, WhisperConfig +from transformers import BatchFeature, TensorType, WhisperConfig from transformers.tokenization_utils_base import TextInput from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig @@ -156,10 +156,12 @@ class VoxtralProcessorAdapter: audios_tokens.append(torch.tensor(audio_tokens)) audios_processed.append(torch.tensor(audio)) - return { - "input_ids": torch.cat(audios_tokens)[None].expand(len(text), -1), - "audio_arrays": audios_processed, - } + return BatchFeature({ + 
"input_ids": + torch.cat(audios_tokens)[None].expand(len(text), -1), + "audio_arrays": + audios_processed, + }) class VoxtralProcessingInfo(BaseProcessingInfo): diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 4dd8b2439b3f5..f545993a5a980 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -204,18 +204,16 @@ class MistralTokenizer(TokenizerBase): self.version: int = int(_mistral_version_str.split("v")[-1]) tokenizer_ = tokenizer.instruct_tokenizer.tokenizer - from mistral_common.tokens.tokenizers.tekken import ( - SpecialTokenPolicy, Tekkenizer) + from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy + from mistral_common.tokens.tokenizers.tekken import Tekkenizer + self.is_tekken = isinstance(tokenizer_, Tekkenizer) from mistral_common.tokens.tokenizers.sentencepiece import ( SentencePieceTokenizer) self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer) - if self.is_tekken: - # Make sure special tokens will not raise - tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE - elif self.is_spm: - pass - else: + self._special_token_policy = (SpecialTokenPolicy.IGNORE + if self.is_tekken else None) + if not (self.is_tekken or self.is_spm): raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}") self._vocab = tokenizer_.vocab() @@ -430,7 +428,8 @@ class MistralTokenizer(TokenizerBase): return self.tokenizer.unk_id ids = [_token_to_id(t) for t in tokens] - decoded = self.tokenizer.decode(ids) + decoded = self.tokenizer.decode(ids, + self._special_token_policy) else: decoded = "".join(tokens) else: @@ -444,7 +443,8 @@ class MistralTokenizer(TokenizerBase): if token in special_tokens: if regular_tokens: decoded_list.append( - self.tokenizer.decode(regular_tokens)) + self.tokenizer.decode(regular_tokens, + self._special_token_policy)) regular_tokens = [] decoded_list.append(token) else: @@ -452,7 +452,8 @@ class 
MistralTokenizer(TokenizerBase): if regular_tokens: decoded_list.append( - self.tokenizer.decode(regular_tokens)) # type: ignore + self.tokenizer.decode(regular_tokens, + self._special_token_policy)) decoded = ''.join(decoded_list) @@ -470,7 +471,7 @@ class MistralTokenizer(TokenizerBase): if isinstance(ids, int): ids = [ids] - return self.tokenizer.decode(ids) + return self.tokenizer.decode(ids, self._special_token_policy) def convert_ids_to_tokens( self, @@ -511,6 +512,9 @@ class MistralTokenizer(TokenizerBase): # See: https://github.com/vllm-project/vllm/pull/8640 # https://github.com/vllm-project/vllm/pull/9625 # if underlying tokenizeir is sentencepiece, we just add "�" - tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids] + tokens = [ + self.tokenizer.id_to_byte_piece(id, self._special_token_policy) + for id in ids + ] return tokens From 934bebf19252da6e1f2583d92e31d583b49498a2 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 29 Aug 2025 08:01:40 +0100 Subject: [PATCH 21/56] Better errors for Transformers backend missing features (#23759) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 25 ++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index f7ced6134da52..5ad0482330ecd 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -17,6 +17,7 @@ """Wrapper around `transformers` models""" from collections.abc import Iterable, Mapping from contextlib import contextmanager +from pathlib import Path from typing import Literal, Optional, Union import regex as re @@ -60,6 +61,21 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, logger = init_logger(__name__) +def get_feature_request_tip( + model: str, + trust_remote_code: bool, +) -> str: + 
hf_url = f"a discussion at https://huggingface.co/{model}/discussions/new" + gh_url = "an issue at https://github.com/huggingface/transformers/issues/new/choose" + url = hf_url if trust_remote_code else gh_url + prefix = f"Please open {url} to request support for this feature. " + if Path(model).exists(): + prefix = "" + doc_url = "https://docs.vllm.ai/en/latest/models/supported_models.html#writing-custom-models" + tip = f"See {doc_url} for instructions on how to add support yourself." + return f"{prefix}{tip}" + + def vllm_flash_attention_forward( # Transformers args module: torch.nn.Module, @@ -480,8 +496,11 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): return if not self.model.supports_pp_plan: + tip = get_feature_request_tip(self.model_config.model, + self.model_config.trust_remote_code) raise ValueError( - f"{type(self.model)} does not support pipeline parallel yet!") + f"{type(self.model)} does not support pipeline parallel. {tip}" + ) module_lists = [] module_list_idx = None @@ -535,8 +554,10 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): models_with_tp_plan = filter(supports_tp_plan, pretrained_models) if not any(models_with_tp_plan) and self.tp_size > 1: + tip = get_feature_request_tip(self.model_config.model, + self.model_config.trust_remote_code) raise ValueError( - f"{type(self.model)} does not support tensor parallel yet!") + f"{type(self.model)} does not support tensor parallel. 
{tip}") def _tensor_parallel(module: nn.Module, prefix: str = "", From 2554b27baa58b15843367f92d7f73d71bb89033d Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Fri, 29 Aug 2025 04:04:02 -0300 Subject: [PATCH 22/56] [V0 Deprecation] Remove pooling model support in V0 (#23434) Signed-off-by: Woosuk Kwon Signed-off-by: Max de Bayser Co-authored-by: Woosuk Kwon --- tests/distributed/test_pipeline_parallel.py | 8 +- tests/entrypoints/llm/test_classify.py | 8 - tests/entrypoints/llm/test_encode.py | 8 - tests/entrypoints/llm/test_reward.py | 8 - tests/entrypoints/llm/test_score.py | 8 - .../offline_mode/test_offline_mode.py | 19 +- tests/entrypoints/openai/test_embedding.py | 8 - tests/entrypoints/openai/test_rerank.py | 8 - tests/entrypoints/openai/test_score.py | 9 - .../models/language/pooling/test_embedding.py | 20 +- tests/models/language/pooling/test_reward.py | 8 - tests/models/language/pooling/test_scoring.py | 9 - tests/models/registry.py | 23 +- tests/worker/test_model_input.py | 54 ----- vllm/core/scheduler.py | 1 - vllm/engine/arg_utils.py | 7 +- vllm/engine/async_llm_engine.py | 111 ++------- vllm/engine/llm_engine.py | 82 ++----- vllm/engine/multiprocessing/__init__.py | 1 + vllm/engine/multiprocessing/client.py | 51 +--- vllm/engine/multiprocessing/engine.py | 5 +- vllm/engine/protocol.py | 5 +- vllm/entrypoints/llm.py | 3 +- vllm/entrypoints/openai/serving_score.py | 4 +- vllm/inputs/data.py | 6 - vllm/inputs/preprocess.py | 12 +- vllm/model_executor/layers/pooler.py | 32 +-- vllm/model_executor/models/bert.py | 2 +- vllm/model_executor/models/gritlm.py | 2 +- vllm/model_executor/models/modernbert.py | 2 +- vllm/model_executor/pooling_metadata.py | 90 ------- vllm/multimodal/inputs.py | 6 - vllm/sequence.py | 11 - vllm/worker/enc_dec_model_runner.py | 5 +- vllm/worker/model_runner.py | 29 --- vllm/worker/model_runner_base.py | 14 +- vllm/worker/pooling_model_runner.py | 222 ------------------ vllm/worker/worker.py | 6 +- 38 files changed, 99 
insertions(+), 808 deletions(-) delete mode 100644 vllm/model_executor/pooling_metadata.py delete mode 100644 vllm/worker/pooling_model_runner.py diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 28150d7682378..1afe9ea970c97 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -118,6 +118,8 @@ class PPTestSettings: multi_node_only: bool = False, load_format: Optional[str] = None, ): + vllm_major_versions = ["1"] if runner == "pooling" else ["0"] + return PPTestSettings( parallel_setups=[ ParallelSetup(tp_size=tp_base, @@ -126,7 +128,7 @@ class PPTestSettings: chunked_prefill=False), ], distributed_backends=["mp"], - vllm_major_versions=["0"], + vllm_major_versions=vllm_major_versions, runner=runner, test_options=PPTestOptions(multi_node_only=multi_node_only, load_format=load_format), @@ -213,7 +215,9 @@ TEXT_GENERATION_MODELS = { EMBEDDING_MODELS = { # type: ignore[var-annotated] # [Text-only] "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"), - "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"), + # TODO: re-enable when https://github.com/vllm-project/vllm/issues/23883 + # is fixed + #"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"), "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast( load_format="dummy", runner="pooling" ), diff --git a/tests/entrypoints/llm/test_classify.py b/tests/entrypoints/llm/test_classify.py index 57705ff669075..7c261a2a5794e 100644 --- a/tests/entrypoints/llm/test_classify.py +++ b/tests/entrypoints/llm/test_classify.py @@ -16,14 +16,6 @@ MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach" prompts = ["The chef prepared a delicious meal."] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - 
@pytest.fixture(scope="module") def llm(): # pytest caches the fixture so we use weakref.proxy to diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index cb54b16b0b044..eae3e234378f2 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -27,14 +27,6 @@ TOKEN_IDS = [ ] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture(scope="module") def llm(): # pytest caches the fixture so we use weakref.proxy to diff --git a/tests/entrypoints/llm/test_reward.py b/tests/entrypoints/llm/test_reward.py index de82cf8d40380..2cee3c8d94e36 100644 --- a/tests/entrypoints/llm/test_reward.py +++ b/tests/entrypoints/llm/test_reward.py @@ -16,14 +16,6 @@ MODEL_NAME = "internlm/internlm2-1_8b-reward" prompts = ["The chef prepared a delicious meal."] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture(scope="module") def llm(): # pytest caches the fixture so we use weakref.proxy to diff --git a/tests/entrypoints/llm/test_score.py b/tests/entrypoints/llm/test_score.py index 5a1339b2addf4..f715dacacb8ff 100644 --- a/tests/entrypoints/llm/test_score.py +++ b/tests/entrypoints/llm/test_score.py @@ -14,14 +14,6 @@ from ...models.utils import softmax MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls" -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture(scope="module") def llm(): # pytest caches the fixture so we use weakref.proxy to diff --git 
a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index dd8d63ad319ac..a154bb1059aae 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -32,15 +32,16 @@ MODEL_CONFIGS = [ "tensor_parallel_size": 1, "tokenizer_mode": "mistral", }, - { - "model": "sentence-transformers/all-MiniLM-L12-v2", - "enforce_eager": True, - "gpu_memory_utilization": 0.20, - "max_model_len": 64, - "max_num_batched_tokens": 64, - "max_num_seqs": 64, - "tensor_parallel_size": 1, - }, + # TODO: re-enable once these tests are run with V1 + # { + # "model": "sentence-transformers/all-MiniLM-L12-v2", + # "enforce_eager": True, + # "gpu_memory_utilization": 0.20, + # "max_model_len": 64, + # "max_num_batched_tokens": 64, + # "max_num_seqs": 64, + # "tensor_parallel_size": 1, + # }, ] diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index cf2442a569388..d46ab304ba6d5 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -24,14 +24,6 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + DTYPE = "bfloat16" -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture(scope="module") def server(): args = [ diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py index 73364294cbcdc..ce4d6c5f5d337 100644 --- a/tests/entrypoints/openai/test_rerank.py +++ b/tests/entrypoints/openai/test_rerank.py @@ -14,14 +14,6 @@ MODEL_NAME = "BAAI/bge-reranker-base" DTYPE = "bfloat16" -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to 
run for every - # test in a package - pass - - @pytest.fixture(scope="module") def server(): args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE] diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index cb6ec795ae969..4fafcfb45fa22 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -12,15 +12,6 @@ from vllm.entrypoints.openai.protocol import ScoreResponse from ...utils import RemoteOpenAIServer - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - MODELS = [ { "name": "BAAI/bge-reranker-v2-m3", diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index 2dd35c4151580..f918b2b91bcc3 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -10,14 +10,6 @@ from vllm.platforms import current_platform from ...utils import check_embeddings_close, check_transformers_version -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.mark.parametrize( "model", [ @@ -32,21 +24,15 @@ def v1(run_with_both_engines): "intfloat/e5-mistral-7b-instruct", # CPU v1 doesn't support sliding window marks=[pytest.mark.core_model]), - # the qwen models interfere with each other (see PR - # https://github.com/vllm-project/vllm/pull/18720). - # To avoid this problem, for now we skip v0 since it will be - # deprecated anyway. 
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base", - marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]), + marks=[pytest.mark.cpu_model]), # [Encoder-only] pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]), pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-small"), - pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct", - marks=[pytest.mark.skip_v1]), + pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), # [Cross-Encoder] - pytest.param("sentence-transformers/stsb-roberta-base-v2", - marks=[pytest.mark.skip_v1]), + pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], ) def test_models( diff --git a/tests/models/language/pooling/test_reward.py b/tests/models/language/pooling/test_reward.py index beafa0aed9862..08722ac98b7ed 100644 --- a/tests/models/language/pooling/test_reward.py +++ b/tests/models/language/pooling/test_reward.py @@ -13,14 +13,6 @@ from ....conftest import HfRunner from ...utils import check_transformers_version -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture def math_step_prompts(): # ruff: noqa: E501 diff --git a/tests/models/language/pooling/test_scoring.py b/tests/models/language/pooling/test_scoring.py index 6b5ff70681459..ef9d5530cde15 100644 --- a/tests/models/language/pooling/test_scoring.py +++ b/tests/models/language/pooling/test_scoring.py @@ -23,15 +23,6 @@ TEXTS_2 = [ "The capital of Germany is Berlin.", ] - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - DTYPE = "half" diff --git a/tests/models/registry.py b/tests/models/registry.py index 85b4c96e3b1c3..13eb4872e7d84 100644 --- a/tests/models/registry.py +++ 
b/tests/models/registry.py @@ -323,8 +323,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { _EMBEDDING_EXAMPLE_MODELS = { # [Text-only] - "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5", v0_only=True), - "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2", v0_only=True), # noqa: E501 + "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"), + "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), # noqa: E501 "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"), "GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0", trust_remote_code=True), @@ -337,9 +337,9 @@ _EMBEDDING_EXAMPLE_MODELS = { "LlamaModel": _HfExamplesInfo("llama", is_available_online=False), "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "ModernBertModel": _HfExamplesInfo("Alibaba-NLP/gte-modernbert-base", - trust_remote_code=True, v0_only=True), + trust_remote_code=True), "NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe", - trust_remote_code=True, v0_only=True), # noqa: E501 + trust_remote_code=True), # noqa: E501 "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B", max_transformers_version="4.53", @@ -347,9 +347,9 @@ _EMBEDDING_EXAMPLE_MODELS = { "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53", transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"), # noqa: E501 - "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2", v0_only=True), # noqa: E501 - "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1", v0_only=True), # noqa: E501 - "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small", v0_only=True), # noqa: E501 + "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501 + "RobertaForMaskedLM": 
_HfExamplesInfo("sentence-transformers/all-roberta-large-v1"), # noqa: E501 + "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small"), # noqa: E501 # [Multimodal] "LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"), "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full", @@ -364,20 +364,19 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = { "GPT2ForSequenceClassification": _HfExamplesInfo("nie3e/sentiment-polish-gpt2-small"), # noqa: E501 # [Cross-encoder] - "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2", v0_only=True), # noqa: E501 + "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2"), # noqa: E501 "GteNewForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-multilingual-reranker-base", # noqa: E501 trust_remote_code=True, hf_overrides={ "architectures": ["GteNewForSequenceClassification"]}),# noqa: E501 - "ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base", v0_only=True), # noqa: E501 - "RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base", v0_only=True), # noqa: E501 - "XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3", v0_only=True), # noqa: E501 + "ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base"), # noqa: E501 + "RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base"), # noqa: E501 + "XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3"), # noqa: E501 } _AUTOMATIC_CONVERTED_MODELS = { # Use as_seq_cls_model for automatic conversion "GemmaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-gemma", # noqa: E501 - v0_only=True, hf_overrides={"architectures": ["GemmaForSequenceClassification"], # noqa: E501 "classifier_from_token": ["Yes"], # noqa: E501 "method": "no_post_processing"}), # noqa: E501 diff --git 
a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index 2031f41fab87d..0f28ef2ba857b 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -9,10 +9,7 @@ from vllm.attention import AttentionMetadata, AttentionMetadataBuilder from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.utils import CommonAttentionState from vllm.model_executor import SamplingMetadata -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata -from vllm.worker.pooling_model_runner import ( - ModelInputForGPUWithPoolingMetadata) class MockAttentionBackend(AttentionBackend): @@ -114,54 +111,3 @@ def test_model_runner_input(): assert (received_model_input.sampling_metadata.selected_token_indices == sampling_metadata.selected_token_indices) assert received_model_input.sampling_metadata.seq_groups is None - - -def test_embedding_model_runner_input(): - pooling_metadata = PoolingMetadata( - seq_groups=[[0]], - seq_data={}, - prompt_lens=[1], - ) - attn_metadata = AttentionMetadata( - num_prefills=1, - num_prefill_tokens=2, - num_decode_tokens=3, - slot_mapping=torch.zeros(1), - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - ) - model_input = ModelInputForGPUWithPoolingMetadata( - input_tokens=torch.ones(10), - input_positions=torch.ones(10), - pooling_metadata=pooling_metadata, - attn_metadata=attn_metadata) - - assert isinstance(model_input, ModelInputForGPUWithPoolingMetadata) - - # Test round trip serialization. - tensor_dict = model_input.as_broadcastable_tensor_dict() - attn_backend = MockAttentionBackend() - received_model_input = ( - ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict( - tensor_dict, attn_backend=attn_backend)) - # Check that received copy has correct values. 
- assert isinstance(received_model_input, - ModelInputForGPUWithPoolingMetadata) - assert received_model_input.input_tokens is not None - assert ( - received_model_input.input_tokens == model_input.input_tokens).all() - assert received_model_input.input_positions is not None - assert (received_model_input.input_positions == model_input.input_positions - ).all() - assert received_model_input.multi_modal_kwargs is None - assert (received_model_input.multi_modal_kwargs == - model_input.multi_modal_kwargs) - assert received_model_input.lora_requests is None - assert received_model_input.lora_requests == model_input.lora_requests - assert received_model_input.lora_mapping is None - assert received_model_input.lora_mapping == model_input.lora_mapping - for field in dataclasses.fields(AttentionMetadata): - assert getattr(received_model_input.attn_metadata, field.name, - None) == getattr(attn_metadata, field.name, None) - # Pooling metadata is not broadcast. - assert received_model_input.pooling_metadata is None diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index c89f3f6632642..d7864293e9647 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1591,7 +1591,6 @@ class Scheduler: encoder_seq_data=encoder_seq_data, cross_block_table=cross_block_table, state=seq_group.state, - token_type_ids=seq_group.token_type_ids, # `multi_modal_data` will only be present for the 1st comm # between engine and worker. 
# the subsequent comms can still use delta, but diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7802802f138b7..06bd97dd6abe9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1566,8 +1566,7 @@ class EngineArgs: use_spec_decode = self.speculative_config is not None if (is_gpu and not use_sliding_window and not use_spec_decode - and not self.enable_lora - and model_config.runner_type != "pooling"): + and not self.enable_lora): self.enable_chunked_prefill = True logger.warning( "Chunked prefill is enabled by default for models " @@ -1585,10 +1584,6 @@ class EngineArgs: "OOM during the initial memory profiling phase, or result " "in low performance due to small KV cache size. Consider " "setting --max-model-len to a smaller value.", max_model_len) - elif (self.enable_chunked_prefill - and model_config.runner_type == "pooling"): - msg = "Chunked prefill is not supported for pooling models" - raise ValueError(msg) # if using prefix caching, we must set a hash algo if self.enable_prefix_caching: diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 4fb028627a8c4..9f9ad1854c3b6 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -72,8 +72,8 @@ STOP_ITERATION = Exception() # Sentinel class AsyncStream: - """A stream of RequestOutputs or PoolingRequestOutputs for a request - that can be iterated over asynchronously via an async generator.""" + """A stream of RequestOutputs for a request that can be iterated over + asynchronously via an async generator.""" def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: self.request_id = request_id @@ -81,8 +81,7 @@ class AsyncStream: self._queue: asyncio.Queue = asyncio.Queue() self._finished = False - def put(self, item: Union[RequestOutput, PoolingRequestOutput, - Exception]) -> None: + def put(self, item: Union[RequestOutput, Exception]) -> None: if not self._finished: self._queue.put_nowait(item) @@ 
-99,9 +98,7 @@ class AsyncStream: def finished(self) -> bool: return self._finished - async def generator( - self - ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: + async def generator(self) -> AsyncGenerator[RequestOutput, None]: try: while True: result = await self._queue.get() @@ -151,8 +148,7 @@ class RequestTracker: self.abort_request(rid, exception=exc) def process_request_output(self, - request_output: Union[RequestOutput, - PoolingRequestOutput], + request_output: RequestOutput, *, verbose: bool = False) -> None: """Process a request output from the engine.""" @@ -261,9 +257,7 @@ class _AsyncLLMEngine(LLMEngine): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - async def step_async( - self, virtual_engine: int - ) -> List[Union[RequestOutput, PoolingRequestOutput]]: + async def step_async(self, virtual_engine: int) -> List[RequestOutput]: """Performs one decoding iteration and returns newly generated results. The workers are ran asynchronously if possible. 
@@ -405,7 +399,7 @@ class _AsyncLLMEngine(LLMEngine): self, request_id: str, prompt: PromptType, - params: Union[SamplingParams, PoolingParams], + params: SamplingParams, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, @@ -779,14 +773,14 @@ class AsyncLLMEngine(EngineClient): self, request_id: str, prompt: PromptType, - params: Union[SamplingParams, PoolingParams], + params: SamplingParams, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, data_parallel_rank: Optional[int] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: + ) -> AsyncGenerator[RequestOutput, None]: if not self.is_running: if self.start_engine_loop: self.start_background_loop() @@ -908,7 +902,7 @@ class AsyncLLMEngine(EngineClient): await self.abort(request_id) raise - async def encode( + def encode( self, prompt: PromptType, pooling_params: PoolingParams, @@ -918,85 +912,8 @@ class AsyncLLMEngine(EngineClient): priority: int = 0, tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from a pooling model. - - Generate outputs for a request. This method is a coroutine. It adds the - request into the waiting queue of the LLMEngine and streams the outputs - from the LLMEngine to the caller. - - Args: - prompt: The prompt to the LLM. See - [`PromptType`][vllm.inputs.PromptType] for more details about - the format of each input. - pooling_params: The pooling parameters of the request. - request_id: The unique id of the request. - lora_request: LoRA request to use for generation, if any. - trace_headers: OpenTelemetry trace headers. - priority: The priority of the request. - Only applicable with priority scheduling. 
- - Yields: - The output `PoolingRequestOutput` objects from the LLMEngine - for the request. - - Details: - - If the engine is not running, start the background loop, - which iteratively invokes - [`vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`][] - to process the waiting requests. - - Add the request to the engine's `RequestTracker`. - On the next background loop, this request will be sent to - the underlying engine. - Also, a corresponding `AsyncStream` will be created. - - Wait for the request outputs from `AsyncStream` and yield them. - - Example: - ``` - # Please refer to entrypoints/api_server.py for - # the complete example. - - # initialize the engine and the example input - # note that engine_args here is AsyncEngineArgs instance - engine = AsyncLLMEngine.from_engine_args(engine_args) - example_input = { - "input": "What is LLM?", - "request_id": 0, - } - - # start the generation - results_generator = engine.encode( - example_input["input"], - PoolingParams(), - example_input["request_id"]) - - # get the results - final_output = None - async for request_output in results_generator: - if await request.is_disconnected(): - # Abort the request if the client disconnects. - await engine.abort(request_id) - # Return or raise an error - ... - final_output = request_output - - # Process and return the final output - ... - ``` - """ - try: - async for output in await self.add_request( - request_id, - prompt, - pooling_params, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - tokenization_kwargs=tokenization_kwargs, - ): - yield LLMEngine.validate_output(output, PoolingRequestOutput) - except asyncio.CancelledError: - await self.abort(request_id) - raise + raise NotImplementedError( + "Pooling models are not supported in vLLM V0") async def abort(self, request_id: Union[str, Iterable[str]]) -> None: """Abort a request. 
@@ -1104,8 +1021,8 @@ class AsyncLLMEngine(EngineClient): async def is_sleeping(self) -> bool: return self.engine.is_sleeping() - async def add_lora(self, lora_request: LoRARequest) -> None: - self.engine.add_lora(lora_request) + async def add_lora(self, lora_request: LoRARequest) -> bool: + return self.engine.add_lora(lora_request) async def collective_rpc(self, method: str, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 03c2f0375da42..7a5130af0bbbf 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -40,12 +40,11 @@ from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.outputs import (PoolingRequestOutput, RequestOutput, RequestOutputFactory) -from vllm.pooling_params import PoolingParams from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup, - PoolingSequenceGroupOutput, Sequence, SequenceGroup, - SequenceGroupBase, SequenceGroupMetadata, - SequenceGroupOutput, SequenceStatus) + Sequence, SequenceGroup, SequenceGroupBase, + SequenceGroupMetadata, SequenceGroupOutput, + SequenceStatus) from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context, init_tracer) from vllm.transformers_utils.detokenizer import Detokenizer @@ -93,8 +92,7 @@ class SchedulerContext: def __init__(self) -> None: self.output_queue: Deque[OutputData] = deque() - self.request_outputs: List[Union[RequestOutput, - PoolingRequestOutput]] = [] + self.request_outputs: List[RequestOutput] = [] self.seq_group_metadata_list: Optional[ List[SequenceGroupMetadata]] = None self.scheduler_outputs: Optional[SchedulerOutputs] = None @@ -261,8 +259,7 @@ class LLMEngine: self.model_executor = executor_class(vllm_config=vllm_config) - if self.model_config.runner_type != "pooling": - self._initialize_kv_caches() + self._initialize_kv_caches() # If usage stat is enabled, 
collect relevant info. if is_usage_stats_enabled(): @@ -541,7 +538,7 @@ class LLMEngine: self, request_id: str, processed_inputs: ProcessorInputs, - params: Union[SamplingParams, PoolingParams], + params: SamplingParams, arrival_time: float, lora_request: Optional[LoRARequest], trace_headers: Optional[Mapping[str, str]] = None, @@ -577,7 +574,7 @@ class LLMEngine: encoder_seq = (None if encoder_inputs is None else Sequence( seq_id, encoder_inputs, block_size, eos_token_id, lora_request)) - # Create a SequenceGroup based on SamplingParams or PoolingParams + # Create a SequenceGroup based on SamplingParams if isinstance(params, SamplingParams): seq_group = self._create_sequence_group_with_sampling( request_id, @@ -588,18 +585,8 @@ class LLMEngine: trace_headers=trace_headers, encoder_seq=encoder_seq, priority=priority) - elif isinstance(params, PoolingParams): - seq_group = self._create_sequence_group_with_pooling( - request_id, - seq, - params, - arrival_time=arrival_time, - lora_request=lora_request, - encoder_seq=encoder_seq, - priority=priority) else: - raise ValueError( - "Either SamplingParams or PoolingParams must be provided.") + raise ValueError("SamplingParams must be provided.") # Add the sequence group to the scheduler with least unfinished seqs. costs = [ @@ -618,7 +605,7 @@ class LLMEngine: self, request_id: str, prompt: PromptType, - params: Union[SamplingParams, PoolingParams], + params: SamplingParams, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, @@ -636,9 +623,8 @@ class LLMEngine: prompt: The prompt to the LLM. See [PromptType][vllm.inputs.PromptType] for more details about the format of each input. - params: Parameters for sampling or pooling. + params: Parameters for sampling. [SamplingParams][vllm.SamplingParams] for text generation. - [PoolingParams][vllm.PoolingParams] for pooling. arrival_time: The arrival time of the request. 
If None, we use the current monotonic time. lora_request: The LoRA request to add. @@ -760,29 +746,6 @@ class LLMEngine: return seq_group - def _create_sequence_group_with_pooling( - self, - request_id: str, - seq: Sequence, - pooling_params: PoolingParams, - arrival_time: float, - lora_request: Optional[LoRARequest], - encoder_seq: Optional[Sequence] = None, - priority: int = 0, - ) -> SequenceGroup: - """Creates a SequenceGroup with PoolingParams.""" - # Defensive copy of PoolingParams, which are used by the pooler - pooling_params = pooling_params.clone() - # Create the sequence group. - seq_group = SequenceGroup(request_id=request_id, - seqs=[seq], - arrival_time=arrival_time, - lora_request=lora_request, - pooling_params=pooling_params, - encoder_seq=encoder_seq, - priority=priority) - return seq_group - def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: """Aborts a request(s) with the given ID. @@ -856,18 +819,6 @@ class LLMEngine: success = success and scheduler.reset_prefix_cache(device) return success - @staticmethod - def _process_sequence_group_outputs( - seq_group: SequenceGroup, - outputs: List[PoolingSequenceGroupOutput], - ) -> None: - seq_group.pooled_data = outputs[0].data - - for seq in seq_group.get_seqs(): - seq.status = SequenceStatus.FINISHED_STOPPED - - return - def _process_model_outputs(self, ctx: SchedulerContext, request_id: Optional[str] = None) -> None: @@ -962,13 +913,10 @@ class LLMEngine: seq_group.metrics.model_execute_time = ( o.model_execute_time) - if self.model_config.runner_type == "pooling": - self._process_sequence_group_outputs(seq_group, output) - else: - self.output_processor.process_prompt_logprob(seq_group, output) - if seq_group_meta.do_sample: - self.output_processor.process_outputs( - seq_group, output, is_async) + self.output_processor.process_prompt_logprob(seq_group, output) + if seq_group_meta.do_sample: + self.output_processor.process_outputs(seq_group, output, + is_async) if 
seq_group.is_finished(): finished_now.append(i) @@ -1090,7 +1038,7 @@ class LLMEngine: seq.append_token_id(sample.output_token, sample.logprobs, sample.output_embed) - def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: + def step(self) -> List[RequestOutput]: """Performs one decoding iteration and returns newly generated results.
diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index ff0405d2f843e..9f64ee0808df2 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -120,6 +120,7 @@ class RPCLoadAdapterRequest: @dataclass class RPCAdapterLoadedResponse: request_id: str + lora_loaded: bool RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest, diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 0bb11328b1db5..2d3248859c940 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -6,7 +6,7 @@ import copy import pickle from contextlib import contextmanager, suppress from typing import (Any, AsyncGenerator, Dict, Iterable, Iterator, List, - Mapping, Optional, Union, cast) + Mapping, Optional, Union) import cloudpickle import psutil @@ -477,10 +477,8 @@ class MQLLMEngineClient(EngineClient): Any priority other than 0 will lead to an error if the scheduling policy is not "priority". """ - return cast( - AsyncGenerator[RequestOutput, None], - self._process_request(prompt, sampling_params, request_id, - lora_request, trace_headers, priority)) + return self._process_request(prompt, sampling_params, request_id, + lora_request, trace_headers, priority) def encode( self, @@ -490,45 +488,20 @@ class MQLLMEngineClient(EngineClient): lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from a pooling model. - - Generate outputs for a request. This method is a coroutine. It adds the - request into the waiting queue of the LLMEngine and streams the outputs - from the LLMEngine to the caller. - - Args: - prompt: The prompt to the LLM. 
See - [`PromptType`][vllm.inputs.PromptType] for more details about - the format of each input. - pooling_params: The pooling parameters of the request. - request_id: The unique id of the request. - lora_request: LoRA request to use for generation, if any. - trace_headers: OpenTelemetry trace headers. - - Yields: - The output `PoolingRequestOutput` objects from the LLMEngine - for the request. - """ - return cast( - AsyncGenerator[PoolingRequestOutput, None], - self._process_request(prompt, - pooling_params, - request_id, - lora_request, - trace_headers, - priority=priority)) + raise NotImplementedError( + "Pooling models are not supported in vLLM V0") async def _process_request( self, prompt: PromptType, - params: Union[SamplingParams, PoolingParams], + params: SamplingParams, request_id: str, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, - ) -> Union[AsyncGenerator[RequestOutput, None], AsyncGenerator[ - PoolingRequestOutput, None]]: + ) -> AsyncGenerator[RequestOutput, None]: """Send an RPCGenerateRequest to the RPCServer and stream responses.""" # If already dead, error out. @@ -547,7 +520,7 @@ class MQLLMEngineClient(EngineClient): try: # 2) Detach logits processors so that they can be pickled # separately (may require cloudpickle which is slower) - if isinstance(params, SamplingParams) and params.logits_processors: + if params.logits_processors: # Defensive shallow copy params = copy.copy(params) logits_processors = params.logits_processors @@ -646,13 +619,14 @@ class MQLLMEngineClient(EngineClient): raise request_output return request_output.is_sleeping - async def add_lora(self, lora_request: LoRARequest) -> None: + async def add_lora(self, lora_request: LoRARequest) -> bool: """Load a new LoRA adapter into the engine for future requests.""" # Uses the same I/O as generate requests request = RPCLoadAdapterRequest(lora_request) # Create output queue for this request. 
- queue: asyncio.Queue[Union[None, BaseException]] = asyncio.Queue() + queue: asyncio.Queue[Union[ + BaseException, RPCAdapterLoadedResponse]] = asyncio.Queue() self.output_queues[request.request_id] = queue # Send the request @@ -666,3 +640,4 @@ class MQLLMEngineClient(EngineClient): # Raise on error, otherwise happily return None if isinstance(request_output, BaseException): raise request_output + return request_output.lora_loaded diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 903f3fd71ebcd..343b8df7e87bd 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -347,7 +347,7 @@ class MQLLMEngine: def _handle_load_adapter_request(self, request: RPCLoadAdapterRequest): try: - self.engine.add_lora(request.lora_request) + lora_loaded = self.engine.add_lora(request.lora_request) except BaseException as e: # Send back an error if the adater fails to load rpc_err = RPCError(request_id=request.request_id, @@ -357,7 +357,8 @@ class MQLLMEngine: return # Otherwise, send back the successful load message self._send_outputs( - RPCAdapterLoadedResponse(request_id=request.request_id)) + RPCAdapterLoadedResponse(request_id=request.request_id, + lora_loaded=lora_loaded)) def _handle_is_sleeping_request(self, request: RPCIsSleepingRequest): is_sleeping = self.is_sleeping() diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 5e8ac9c0b3987..31c36b8562317 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -3,7 +3,7 @@ import asyncio from abc import ABC, abstractmethod -from typing import AsyncGenerator, Iterable, Mapping, Optional, Union +from typing import Any, AsyncGenerator, Iterable, Mapping, Optional, Union from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function from vllm.config import DecodingConfig, ModelConfig, VllmConfig @@ -224,6 +224,7 @@ class EngineClient(ABC): lora_request: Optional[LoRARequest] = None, trace_headers: 
Optional[Mapping[str, str]] = None, priority: int = 0, + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> AsyncGenerator[PoolingRequestOutput, None]: """Generate outputs for a request from a pooling model.""" ... @@ -320,7 +321,7 @@ class EngineClient(ABC): ... @abstractmethod - async def add_lora(self, lora_request: LoRARequest) -> None: + async def add_lora(self, lora_request: LoRARequest) -> bool: """Load a new LoRA adapter into the engine for future requests.""" ... diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 72b6123670b70..9d900e691b0a0 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1156,8 +1156,7 @@ class LLM: tokenization_kwargs=tokenization_kwargs, ) - if envs.VLLM_USE_V1 and (token_type_ids := engine_prompt.pop( - "token_type_ids", None)): + if (token_type_ids := engine_prompt.pop("token_type_ids", None)): params = pooling_params.clone() compressed = compress_token_type_ids(token_type_ids) params.extra_kwargs = {"compressed_token_type_ids": compressed} diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 37838e22a4002..c54deb371d545 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -7,7 +7,6 @@ from typing import Any, Optional, Union from fastapi import Request -from vllm import envs from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger @@ -229,8 +228,7 @@ class ServingScores(OpenAIServing): params=default_pooling_params, lora_request=lora_request) - if envs.VLLM_USE_V1 and (token_type_ids := engine_prompt.pop( - "token_type_ids", None)): + if (token_type_ids := engine_prompt.pop("token_type_ids", None)): pooling_params = default_pooling_params.clone() compressed = compress_token_type_ids(token_type_ids) pooling_params.extra_kwargs = { diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 
23cb5e5022f19..8e6d3136d5e9b 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -174,9 +174,6 @@ class TokenInputs(TypedDict): prompt_token_ids: list[int] """The token IDs of the prompt.""" - token_type_ids: NotRequired[list[int]] - """The token type IDs of the prompt.""" - prompt: NotRequired[str] """ The original prompt text corresponding to the token IDs, if available. @@ -190,7 +187,6 @@ class TokenInputs(TypedDict): def token_inputs( prompt_token_ids: list[int], - token_type_ids: Optional[list[int]] = None, prompt: Optional[str] = None, cache_salt: Optional[str] = None, ) -> TokenInputs: @@ -200,8 +196,6 @@ def token_inputs( if prompt is not None: inputs["prompt"] = prompt - if token_type_ids is not None: - inputs["token_type_ids"] = token_type_ids if cache_salt is not None: inputs["cache_salt"] = cache_salt diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index fff9c42fe36fe..3dbd9057fe0f7 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -355,7 +355,6 @@ class InputPreprocessor: mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_token_ids = parsed_content["prompt_token_ids"] - token_type_ids = parsed_content.get("token_type_ids") inputs: Union[TokenInputs, MultiModalInputs] if multi_modal_data := parsed_content.get("multi_modal_data"): @@ -368,10 +367,7 @@ class InputPreprocessor: mm_hash_overrides=mm_hash_overrides, ) else: - inputs = token_inputs( - prompt_token_ids=prompt_token_ids, - token_type_ids=token_type_ids, - ) + inputs = token_inputs(prompt_token_ids=prompt_token_ids) if cache_salt := parsed_content.get("cache_salt"): inputs["cache_salt"] = cache_salt @@ -387,7 +383,6 @@ class InputPreprocessor: mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_token_ids = parsed_content["prompt_token_ids"] - token_type_ids = parsed_content.get("token_type_ids") inputs: Union[TokenInputs, 
MultiModalInputs] if multi_modal_data := parsed_content.get("multi_modal_data"): @@ -400,10 +395,7 @@ class InputPreprocessor: mm_hash_overrides=mm_hash_overrides, ) else: - inputs = token_inputs( - prompt_token_ids=prompt_token_ids, - token_type_ids=token_type_ids, - ) + inputs = token_inputs(prompt_token_ids=prompt_token_ids, ) if cache_salt := parsed_content.get("cache_salt"): inputs["cache_salt"] = cache_salt diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index eebf7b2508dbc..66101e1a99243 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -13,17 +13,12 @@ import torch.nn.functional as F from transformers import PretrainedConfig from vllm.config import ModelConfig, PoolerConfig -from vllm.model_executor.pooling_metadata import ( # noqa: E501 - PoolingMetadata as V0PoolingMetadata) -from vllm.model_executor.pooling_metadata import PoolingTensors from vllm.pooling_params import PoolingParams from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput from vllm.tasks import PoolingTask from vllm.utils import current_stream, resolve_obj_by_qualname -from vllm.v1.pool.metadata import PoolingCursor -from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata +from vllm.v1.pool.metadata import PoolingCursor, PoolingMetadata -PoolingMetadata = Union[V0PoolingMetadata, V1PoolingMetadata] PoolingFn = Callable[ [Union[torch.Tensor, list[torch.Tensor]], PoolingMetadata], Union[torch.Tensor, list[torch.Tensor]]] @@ -127,36 +122,23 @@ def get_prompt_lens( hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> torch.Tensor: - if isinstance(pooling_metadata, V1PoolingMetadata): - return pooling_metadata.prompt_lens - - return PoolingTensors.from_pooling_metadata( - pooling_metadata, hidden_states[0].device).prompt_lens + return pooling_metadata.prompt_lens def get_prompt_token_ids( pooling_metadata: PoolingMetadata) -> 
list[torch.Tensor]: - if isinstance(pooling_metadata, V1PoolingMetadata): - assert pooling_metadata.prompt_token_ids is not None, ( - "Please set `requires_token_ids=True` in `get_pooling_updates`") - - return [ - pooling_metadata.prompt_token_ids[i, :num] - for i, num in enumerate(pooling_metadata.prompt_lens) - ] + assert pooling_metadata.prompt_token_ids is not None, ( + "Please set `requires_token_ids=True` in `get_pooling_updates`") return [ - torch.tensor(seq_data_i.prompt_token_ids) - for seq_data_i in pooling_metadata.seq_data.values() + pooling_metadata.prompt_token_ids[i, :num] + for i, num in enumerate(pooling_metadata.prompt_lens) ] def get_pooling_params( pooling_metadata: PoolingMetadata) -> list[PoolingParams]: - if isinstance(pooling_metadata, V0PoolingMetadata): - pooling_params = [p for _, p in pooling_metadata.seq_groups] - else: - pooling_params = pooling_metadata.pooling_params + pooling_params = pooling_metadata.pooling_params return pooling_params diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index b34ca5cbe963d..8f23439655ed7 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -24,9 +24,9 @@ from vllm.model_executor.layers.pooler import (ClassifierPooler, from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask +from vllm.v1.pool.metadata import PoolingMetadata from .interfaces import SupportsCrossEncoding, SupportsQuant from .interfaces_base import default_pooling_type diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 1b3d541c65cf8..a7b324f0a5b4c 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -15,10 +15,10 @@ from 
vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, build_output, get_prompt_lens, get_prompt_token_ids) from vllm.model_executor.models.llama import LlamaForCausalLM -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import PoolerOutput from vllm.tasks import PoolingTask from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config +from vllm.v1.pool.metadata import PoolingMetadata from .interfaces_base import default_pooling_type diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 4778555861286..776287589808a 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -22,9 +22,9 @@ from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask +from vllm.v1.pool.metadata import PoolingMetadata from .interfaces import SupportsCrossEncoding from .interfaces_base import default_pooling_type diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py deleted file mode 100644 index 3209879193453..0000000000000 --- a/vllm/model_executor/pooling_metadata.py +++ /dev/null @@ -1,90 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from dataclasses import dataclass -from typing import Any, Optional - -import torch - -from vllm.pooling_params import PoolingParams -from vllm.utils import is_pin_memory_available -from vllm.v1.pool.metadata import PoolingCursor, build_pooling_cursor - - -class PoolingMetadata: - """Metadata for pooling operations in the Pooler layer. 
- - This class holds the necessary information for pooling operations, - providing context for how to perform pooling and other related operations. - - Attributes: - seq_groups: List of (seq_ids, pooling_params). - seq_data: A mapping of sequence ID to additional sequence data. - prompt_lens: List of the lengths of each prompt. - """ - - def __init__( - self, - seq_groups: list[tuple[list[int], PoolingParams]], - seq_data: dict[int, Any], # Specific data related to sequences - prompt_lens: list[int], - pooling_cursor: Optional[PoolingCursor] = None) -> None: - self.seq_groups = seq_groups - self.seq_data = seq_data - self.prompt_lens = prompt_lens - self.pooling_cursor: Optional[PoolingCursor] = pooling_cursor - - def __repr__(self) -> str: - return ("PoolingMetadata(" - f"seq_groups={self.seq_groups}, " - f"seq_data={self.seq_data}, " - f"prompt_lens={self.prompt_lens})") - - def __getitem__(self, indices: slice): - return PoolingMetadata( - seq_groups=self.seq_groups[indices], - seq_data=dict(list(self.seq_data.items())[indices]), - prompt_lens=self.prompt_lens[indices], - pooling_cursor=None - if self.pooling_cursor is None else self.pooling_cursor[indices], - ) - - def build_pooling_cursor(self, num_scheduled_tokens: list[int], - device: torch.device): - prompt_lens = torch.tensor(self.prompt_lens, device="cpu") - self.pooling_cursor = build_pooling_cursor(num_scheduled_tokens, - prompt_lens, - device=device) - - -@dataclass -class PoolingTensors: - """Tensors for pooling.""" - - prompt_lens: torch.Tensor - - @classmethod - def from_pooling_metadata( - cls, - pooling_metadata: "PoolingMetadata", - device: torch.device, - ) -> "PoolingTensors": - """ - Create PoolingTensors from PoolingMetadata. - - Args: - pooling_metadata: PoolingMetadata instance to convert. - device: Device to store the tensors. 
- """ - # Convert prompt lengths to tensor - pin_memory = is_pin_memory_available() - - prompt_lens_t = torch.tensor( - pooling_metadata.prompt_lens, - device="cpu", - dtype=torch.long, - pin_memory=pin_memory, - ) - - return cls(prompt_lens=prompt_lens_t.to(device=device, - non_blocking=True), ) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 2c0ebaced67ef..cf6ab6c8dea2d 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -913,9 +913,6 @@ class MultiModalInputs(TypedDict): prompt_token_ids: list[int] """The processed token IDs which includes placeholder tokens.""" - token_type_ids: NotRequired[list[int]] - """The token type IDs of the prompt.""" - mm_kwargs: MultiModalKwargsOptionalItems """Keyword arguments to be directly passed to the model after batching.""" @@ -946,6 +943,3 @@ class MultiModalEncDecInputs(MultiModalInputs): encoder_prompt_token_ids: list[int] """The processed token IDs of the encoder prompt.""" - - encoder_token_type_ids: NotRequired[list[int]] - """The token type IDs of the encoder prompt.""" diff --git a/vllm/sequence.py b/vllm/sequence.py index 36b1b198bd5a5..7b48b7be9f511 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -508,12 +508,6 @@ class Sequence: return [0] * len(self.inputs["prompt_embeds"]) return self.inputs["prompt_token_ids"] - @property - def token_type_ids(self) -> list[int]: - if self.inputs["type"] == "embeds": - return [] - return self.inputs.get("token_type_ids", []) - @property def multi_modal_data(self) -> MultiModalKwargs: if self.inputs["type"] == "multimodal": @@ -765,10 +759,6 @@ class SequenceGroup: return (self.encoder_seq.prompt_token_ids if self.encoder_seq is not None else None) - @property - def token_type_ids(self) -> Optional[list[int]]: - return self.first_seq.token_type_ids - @property def multi_modal_data(self) -> MultiModalKwargs: if self.first_seq.multi_modal_data: @@ -972,7 +962,6 @@ class SequenceGroupMetadata( computed_block_nums: 
Optional[list[int]] = None state: Optional[SequenceGroupState] = msgspec.field( default_factory=lambda: SequenceGroupState()) - token_type_ids: Optional[list[int]] = None multi_modal_data: Optional[MultiModalKwargs] = None multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None encoder_seq_data: Optional[SequenceData] = None diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index cb5d5664ab5c0..12fd25f4de2ad 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -24,8 +24,7 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, MultiModalRegistry) from vllm.platforms import _Backend from vllm.sampling_params import SamplingParams -from vllm.sequence import (IntermediateTensors, PoolerOutput, - SequenceGroupMetadata) +from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPUBuilder, @@ -161,7 +160,7 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): kv_caches: List[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, - ) -> Optional[List[PoolerOutput]]: + ) -> Optional[List[SamplerOutput]]: if num_steps > 1: raise ValueError("num_steps > 1 is not supported in " "EncoderDecoderModelRunner") diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a1c08fa814db4..f05401fd01327 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -86,7 +86,6 @@ class ModelInputForGPU(ModelRunnerInputBase): input_tokens: Optional[torch.Tensor] = None inputs_embeds: Optional[torch.Tensor] = None input_positions: Optional[torch.Tensor] = None - token_types: Optional[torch.Tensor] = None seq_lens: Optional[List[int]] = None query_lens: Optional[List[int]] = None lora_mapping: Optional["LoRAMapping"] = None @@ 
-192,7 +191,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): self.input_tokens[0].clear() # type: ignore self.inputs_embeds = None # type: ignore self.input_positions[0].clear() # type: ignore - self.token_types[0].clear() # type: ignore self.mrope_input_positions = None # type: ignore self.seq_lens[0] = 0 # type: ignore self.orig_seq_lens[0] = 0 # type: ignore @@ -219,7 +217,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): input_tokens: Optional[List[List[int]]] = None, inputs_embeds: Optional[torch.Tensor] = None, input_positions: Optional[List[List[int]]] = None, - token_types: Optional[List[List[int]]] = None, mrope_input_positions: Optional[List[List[List[int]]]] = None, # The sequence length (may be capped to the sliding window). @@ -284,12 +281,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): for seq_id in range(len(self.seq_ids)): self.input_positions[seq_id].clear() - if token_types: - self.token_types = token_types - else: - for seq_id in range(len(self.seq_ids)): - self.token_types[seq_id].clear() - self.mrope_input_positions = None if seq_lens: @@ -348,7 +339,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): self.input_tokens = input_tokens or [] self.inputs_embeds = inputs_embeds self.input_positions = input_positions or [] - self.token_types = token_types or [] self.mrope_input_positions = mrope_input_positions or None self.seq_lens = seq_lens or [] self.orig_seq_lens = orig_seq_lens or [] @@ -376,7 +366,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): self.input_tokens = [[] for _ in range(self.n_seqs)] self.input_positions = [[] for _ in range(self.n_seqs)] - self.token_types = [[] for _ in range(self.n_seqs)] self.mrope_input_positions = None self.seq_lens = [0] * self.n_seqs self.orig_seq_lens = [0] * self.n_seqs @@ -400,7 +389,6 @@ class 
ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): f"inputs_embeds.shape=" f"{getattr(self.inputs_embeds, 'shape', None)}, " f"input_positions={self.input_positions}, " - f"token_types={self.token_types}, " f"mrope_input_positions={self.mrope_input_positions}, " f"seq_lens={self.seq_lens}, " f"orig_seq_lens={self.orig_seq_lens}, " @@ -522,8 +510,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): prompt_embeds = seq_data.get_token_embeddings( )[context_len:seq_len] - token_types = seq_group_metadata.token_type_ids - inter_data.seq_lens[seq_idx] = seq_len inter_data.orig_seq_lens[seq_idx] = seq_len inter_data.prompt_lens[seq_idx] = seq_data.get_prompt_len() @@ -531,8 +517,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): inter_data.input_tokens[seq_idx].extend(tokens) inter_data.inputs_embeds = prompt_embeds inter_data.input_positions[seq_idx].extend(range(context_len, seq_len)) - inter_data.token_types[seq_idx].extend( - token_types if token_types else []) inter_data.query_lens[seq_idx] = seq_len - context_len if seq_data.mrope_position_delta is not None: @@ -590,8 +574,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): seq_idx][uncomputed_start:] inter_data.input_positions[seq_idx] = inter_data.input_positions[ seq_idx][uncomputed_start:] - inter_data.token_types[seq_idx] = inter_data.token_types[seq_idx][ - uncomputed_start:] context_len = prefix_cache_len inter_data.context_lens[seq_idx] = context_len @@ -606,8 +588,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): seq_idx][-1:] inter_data.input_positions[seq_idx] = inter_data.input_positions[ seq_idx][-1:] - inter_data.token_types[seq_idx] = inter_data.token_types[seq_idx][ - -1:] inter_data.query_lens[seq_idx] = 1 inter_data.context_lens[seq_idx] = inter_data.seq_lens[seq_idx] - 1 @@ -802,12 +782,9 @@ class 
ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): # Combine and flatten intermediate data. input_tokens = list[int]() inputs_embeds_list = list[torch.Tensor]() - token_types = list[int]() for inter_data in self.inter_data_list: for cur_input_tokens in inter_data.input_tokens: input_tokens.extend(cur_input_tokens) - for cur_token_types in inter_data.token_types: - token_types.extend(cur_token_types) if inter_data.inputs_embeds is not None: inputs_embeds_list.append( inter_data.inputs_embeds.to( @@ -890,11 +867,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): self.runner.device, self.runner.pin_memory) - token_types_tensor = async_tensor_h2d(token_types, torch.long, - self.runner.device, - self.runner.pin_memory) \ - if token_types else None - if mrope_input_positions is not None: for idx in range(3): mrope_input_positions[idx].extend( @@ -951,7 +923,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): input_tokens=input_tokens_tensor, inputs_embeds=inputs_embeds, input_positions=input_positions_tensor, - token_types=token_types_tensor, attn_metadata=attn_metadata, seq_lens=seq_lens, query_lens=query_lens, diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 7b8fe2f802d68..1008b743619a4 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -13,10 +13,9 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.interfaces import supports_transcription -from vllm.model_executor.models.interfaces_base import ( - is_pooling_model, is_text_generation_model) +from vllm.model_executor.models.interfaces_base import is_text_generation_model from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.tasks import GenerationTask, PoolingTask, SupportedTask +from vllm.tasks import GenerationTask, 
SupportedTask if TYPE_CHECKING: from vllm.attention import AttentionMetadata @@ -241,20 +240,11 @@ class ModelRunnerBase(ABC, Generic[T]): return supported_tasks - def get_supported_pooling_tasks(self) -> list[PoolingTask]: - model = self.get_model() - if not is_pooling_model(model): - return [] - - return list(model.pooler.get_supported_tasks()) - def get_supported_tasks(self) -> tuple[SupportedTask, ...]: tasks = list[SupportedTask]() if self.model_config.runner_type == "generate": tasks.extend(self.get_supported_generation_tasks()) - if self.model_config.runner_type == "pooling": - tasks.extend(self.get_supported_pooling_tasks()) return tuple(tasks) diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py deleted file mode 100644 index 3e1950798dbf6..0000000000000 --- a/vllm/worker/pooling_model_runner.py +++ /dev/null @@ -1,222 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast - -import torch - -from vllm.config import VllmConfig -from vllm.distributed import get_pp_group -from vllm.forward_context import set_forward_context -from vllm.logger import init_logger -from vllm.model_executor.models.interfaces_base import VllmModelForPooling -from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.multimodal import MultiModalKwargs -from vllm.pooling_params import PoolingParams -from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData, - SequenceGroupMetadata) -from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPU, - ModelInputForGPUBuilder) - -logger = init_logger(__name__) - - -@dataclasses.dataclass(frozen=True) -class ModelInputForGPUWithPoolingMetadata(ModelInputForGPU): - """ - Used by the PoolingModelRunner. 
- """ - pooling_metadata: Optional["PoolingMetadata"] = None - - -class PoolingModelRunner( - GPUModelRunnerBase[ModelInputForGPUWithPoolingMetadata]): - _model_input_cls: Type[ModelInputForGPUWithPoolingMetadata] = ( - ModelInputForGPUWithPoolingMetadata) - _builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder - - def __init__( - self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - ): - super().__init__(vllm_config=vllm_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker) - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForGPUWithPoolingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]: - if num_steps > 1: - raise ValueError( - "PoolingModelRunner does not support multi-step execution.") - - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - - # Currently cuda graph is only supported by the decode phase. - assert model_input.attn_metadata is not None - prefill_meta = model_input.attn_metadata.prefill_metadata - decode_meta = model_input.attn_metadata.decode_metadata - virtual_engine = model_input.virtual_engine - # Pooling models are (ab-)used also to integrate non text models that - # are not autoregressive (PrithviGeosaptialMAE). - # These model might not use attention and do not really have a prefill - # and decode phase. The model input is processed in one shot and both - # decode_metadata and prefill_metadata would be None for such models. - # See the PlaceholderAttentionMetadata class. - # TODO: Figure out if cuda_graph is of any use for these models and - # explore how to leverage it. 
- if (prefill_meta is None and decode_meta is not None - and decode_meta.use_cuda_graph): - if model_input.inputs_embeds is None: - assert model_input.input_tokens is not None - graph_batch_size = model_input.input_tokens.shape[0] - model_executable = ( - self.graph_runners[model_input.virtual_engine][( - graph_batch_size, False)]) - else: - graph_batch_size = model_input.inputs_embeds.shape[0] - model_executable = ( - self.graph_runners[model_input.virtual_engine][( - graph_batch_size, True)]) - else: - model_executable = self.model - - multi_modal_kwargs = model_input.multi_modal_kwargs or {} - seqlen_agnostic_kwargs = { - "finished_requests_ids": model_input.finished_requests_ids, - "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, - } if self.has_inner_state else {} - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_start = torch.cuda.Event(enable_timing=True) - model_forward_end = torch.cuda.Event(enable_timing=True) - model_forward_start.record() - - cross_enc_kwargs = {} - if model_input.token_types is not None: - cross_enc_kwargs["token_type_ids"] = model_input.token_types - - with set_forward_context(model_input.attn_metadata, self.vllm_config, - virtual_engine): - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs( - multi_modal_kwargs, - device=self.device, - ), - **cross_enc_kwargs, - **seqlen_agnostic_kwargs, - ) - - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end.record() - - # Only perform pooling in the last pipeline stage. 
- if not get_pp_group().is_last_rank: - if (self.is_driver_worker - and hidden_or_intermediate_states is not None - and isinstance(hidden_or_intermediate_states, - IntermediateTensors) - and self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - hidden_or_intermediate_states.tensors["model_forward_time"] = ( - torch.tensor(model_forward_time + orig_model_forward_time)) - return hidden_or_intermediate_states - - # Only perform pooling in the driver worker. - if not self.is_driver_worker: - return [] - - pooling_metadata = model_input.pooling_metadata - assert pooling_metadata is not None - - pooling_metadata.build_pooling_cursor( - num_scheduled_tokens=pooling_metadata.prompt_lens, - device=hidden_or_intermediate_states.device) - - return [ - self.model.pooler(hidden_states=hidden_or_intermediate_states, - pooling_metadata=pooling_metadata) - ] - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, - Any]) -> ModelInputForGPUWithPoolingMetadata: - return ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - - def prepare_model_input( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForGPUWithPoolingMetadata: - assert seq_group_metadata_list is not None - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - # Prepare PoolingMetadata. 
- assert model_input.seq_lens is not None - pooling_metadata = self._prepare_pooling(seq_group_metadata_list, - model_input.seq_lens) - - return dataclasses.replace(model_input, - pooling_metadata=pooling_metadata) - - def _prepare_pooling( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - prompt_lens: List[int], - ) -> PoolingMetadata: - """Prepare PoolingMetadata for the sequence group metadata list.""" - seq_groups: List[Tuple[List[int], PoolingParams]] = [] - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - seq_ids = list(seq_group_metadata.seq_data.keys()) - - pooling_params = seq_group_metadata.pooling_params - assert pooling_params is not None - - task = pooling_params.task - assert task is not None, "You did not set `task` in the API" - - model = cast(VllmModelForPooling, self.model) - to_update = model.pooler.get_pooling_updates(task) - to_update.apply(pooling_params) - - seq_groups.append((seq_ids, pooling_params)) - - seq_data: Dict[int, SequenceData] = {} - for seq_group_metadata in seq_group_metadata_list: - seq_data.update(seq_group_metadata.seq_data) - - pooling_metadata = PoolingMetadata( - seq_groups=seq_groups, - seq_data=seq_data, - prompt_lens=prompt_lens, - ) - - return pooling_metadata diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index fc24d95b80f2c..2e20c89c632c5 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -30,7 +30,6 @@ from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache, from vllm.worker.cache_engine import CacheEngine from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner -from vllm.worker.pooling_model_runner import PoolingModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, WorkerInput) @@ -83,9 +82,7 @@ class Worker(LocalOrDistributedWorkerBase): else {"return_hidden_states": True} ModelRunnerClass: Type[GPUModelRunnerBase] = 
ModelRunner - if model_config.runner_type == "pooling": - ModelRunnerClass = PoolingModelRunner - elif self.model_config.is_encoder_decoder: + if self.model_config.is_encoder_decoder: ModelRunnerClass = EncoderDecoderModelRunner self.model_runner: GPUModelRunnerBase = ModelRunnerClass( vllm_config=self.vllm_config, @@ -99,7 +96,6 @@ class Worker(LocalOrDistributedWorkerBase): # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[CacheEngine] - # Initialize gpu_cache as pooling models don't initialize kv_caches self.gpu_cache: Optional[List[List[torch.Tensor]]] = None self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} From ad39106b16fee0074e814f06ec7a517399ea154d Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Fri, 29 Aug 2025 17:19:58 +0800 Subject: [PATCH 23/56] [CPU] Enable data parallel for CPU backend (#23903) Signed-off-by: jiang1.li --- .../scripts/hardware_ci/run-cpu-test.sh | 24 +++++++++++++++---- docs/getting_started/installation/cpu.md | 3 ++- .../installation/cpu/x86.inc.md | 2 +- vllm/platforms/cpu.py | 8 +++++++ vllm/v1/worker/cpu_model_runner.py | 7 +++++- vllm/v1/worker/cpu_worker.py | 13 ++++++++-- 6 files changed, 48 insertions(+), 9 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 8b8f0e8c6578d..0f734763f13fd 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. 
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" -docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 function cpu_tests() { set -e @@ -89,17 +89,33 @@ function cpu_tests() { pytest -x -s -v \ tests/lora/test_qwen2vl.py" - # online serving + # online serving: tp+pp docker exec cpu-test-"$NUMA_NODE" bash -c ' set -e VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & + server_pid=$! 
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 vllm bench serve \ --backend vllm \ --dataset-name random \ --model meta-llama/Llama-3.2-3B-Instruct \ --num-prompts 20 \ - --endpoint /v1/completions' + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' + + # online serving: tp+dp + docker exec cpu-test-"$NUMA_NODE" bash -c ' + set -e + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 & + server_pid=$! + timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + vllm bench serve \ + --backend vllm \ + --dataset-name random \ + --model meta-llama/Llama-3.2-3B-Instruct \ + --num-prompts 20 \ + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' } # All of CPU tests are expected to be finished less than 40 mins. diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index e76ec35e1edcb..7f0ecb2bc0b74 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -96,6 +96,7 @@ Currently, there are no pre-built CPU wheels. - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`. - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists or `auto` (by default). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. 
By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively. - `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `None`. If the value is not set and use `auto` thread binding, no CPU will be reserved for `world_size == 1`, 1 CPU per rank will be reserved for `world_size > 1`. +- `CPU_VISIBLE_MEMORY_NODES`: specify visible NUMA memory nodes for vLLM CPU workers, similar to ```CUDA_VISIBLE_DEVICES```. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. The variable provides more control for the auto thread-binding feature, such as masking nodes and changing nodes binding sequence. - `VLLM_CPU_MOE_PREPACK` (x86 only): whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False). - `VLLM_CPU_SGL_KERNEL` (x86 only, Experimental): whether to use small-batch optimized kernels for linear layer and MoE layer, especially for low-latency requirements like online serving. The kernels require AMX instruction set, BFloat16 weight type and weight shapes divisible by 32. Default is `0` (False). @@ -179,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch - Offline Inference: `256 * world_size` - Online Serving: `128 * world_size` -vLLM CPU supports tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use TP and PP together if there are enough CPU sockets and memory nodes. 
+vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use DP, TP and PP together if there are enough CPU sockets and memory nodes. ### Which quantization configs does vLLM CPU support? diff --git a/docs/getting_started/installation/cpu/x86.inc.md b/docs/getting_started/installation/cpu/x86.inc.md index 6dc6f94249c34..f7af259ace628 100644 --- a/docs/getting_started/installation/cpu/x86.inc.md +++ b/docs/getting_started/installation/cpu/x86.inc.md @@ -43,7 +43,7 @@ docker build -f docker/Dockerfile.cpu \ # Launching OpenAI server docker run --rm \ - --privileged=true \ + --security-opt seccomp=unconfined \ --shm-size=4g \ -p 8000:8000 \ -e VLLM_CPU_KVCACHE_SPACE= \ diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 5686fae5cd7d1..12d5e0bf08652 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -69,6 +69,7 @@ class CpuPlatform(Platform): device_type: str = "cpu" dispatch_key: str = "CPU" dist_backend: str = "gloo" + device_control_env_var = "CPU_VISIBLE_MEMORY_NODES" @property def supported_dtypes(self) -> list[torch.dtype]: @@ -297,6 +298,13 @@ class CpuPlatform(Platform): allowed_numa_nodes.add(x.numa_node) # type: ignore allowed_numa_nodes_list = sorted(allowed_numa_nodes) + env_key = CpuPlatform.device_control_env_var + if (env_key in os.environ and os.environ[env_key] != ""): + visible_nodes = [int(s) for s in os.environ[env_key].split(',')] + allowed_numa_nodes_list = [ + x for x in visible_nodes if x in allowed_cpu_id_list + ] + return allowed_numa_nodes_list, logical_cpu_list @classmethod diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 7d0726112704a..226d7792a42f7 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ 
-1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import contextmanager -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Optional import torch import torch.nn as nn @@ -113,6 +113,11 @@ class CPUModelRunner(GPUModelRunner): def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: return sampled_token_ids.tolist() + def get_dp_padding(self, + num_tokens: int) -> tuple[int, Optional[torch.Tensor]]: + # Note: For CPU backend, dp padding is not required for now. + return 0, None + @contextmanager def _torch_cuda_wrapper(): diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index be78597926e09..b87c4fe09bb90 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -55,7 +55,14 @@ class CPUWorker(Worker): else: self.local_omp_cpuid = "all" else: - self.local_omp_cpuid = omp_cpuids.split("|")[self.rank] + local_dp_rank = self.parallel_config.data_parallel_rank_local + omp_cpuids = omp_cpuids.split("|") + if local_dp_rank is not None: + world_size = self.parallel_config.world_size + omp_cpuids = omp_cpuids[local_dp_rank * + world_size:(local_dp_rank + 1) * + world_size] + self.local_omp_cpuid = omp_cpuids[self.rank] if self.local_omp_cpuid != "all": ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) @@ -162,7 +169,9 @@ class CPUWorker(Worker): # Reserve CPUs for other processes reserve_cpu_num = envs.VLLM_CPU_NUM_OF_RESERVED_CPU if reserve_cpu_num is None: - reserve_cpu_num = 1 if self.parallel_config.world_size > 1 else 0 + need_reserve = (self.parallel_config.world_size > 1 or + self.parallel_config.data_parallel_size_local > 1) + reserve_cpu_num = 1 if need_reserve else 0 assert len(logical_cpu_list) > reserve_cpu_num, ( f"VLLM_CPU_NUM_OF_RESERVED_CPU ({reserve_cpu_num}) " f"should less than {len(logical_cpu_list)}.") From d9e00dbd1fcf9e4b6b0b42a228d7bb26175cbba4 Mon Sep 17 
00:00:00 2001 From: "wang.yuqi" Date: Fri, 29 Aug 2025 18:12:32 +0800 Subject: [PATCH 24/56] [Performance] V1 Classify Models E2E Performance Optimization (#23541) Signed-off-by: wang.yuqi --- tests/entrypoints/llm/test_classify.py | 6 ++ .../entrypoints/openai/test_classification.py | 30 +++++++++ vllm/entrypoints/openai/api_server.py | 6 +- vllm/model_executor/layers/pooler.py | 62 ++++++++++--------- vllm/v1/worker/gpu_model_runner.py | 15 +++-- 5 files changed, 81 insertions(+), 38 deletions(-) diff --git a/tests/entrypoints/llm/test_classify.py b/tests/entrypoints/llm/test_classify.py index 7c261a2a5794e..6c0c9cd015801 100644 --- a/tests/entrypoints/llm/test_classify.py +++ b/tests/entrypoints/llm/test_classify.py @@ -62,3 +62,9 @@ def test_encode_api(llm: LLM): err_msg = "pooling_task must be one of.+" with pytest.raises(ValueError, match=err_msg): llm.encode(prompts, use_tqdm=False) + + +def test_score_api(llm: LLM): + err_msg = "Score API is only enabled for num_labels == 1." + with pytest.raises(ValueError, match=err_msg): + llm.score("ping", "pong", use_tqdm=False) diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py index 30078fe90257a..36c96d76c2e5f 100644 --- a/tests/entrypoints/openai/test_classification.py +++ b/tests/entrypoints/openai/test_classification.py @@ -226,3 +226,33 @@ def test_pooling(server: RemoteOpenAIServer, model_name: str): }, ) assert response.json()["error"]["type"] == "BadRequestError" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_score(server: RemoteOpenAIServer, model_name: str): + # score api is only enabled for num_labels == 1. 
+ response = requests.post( + server.url_for("score"), + json={ + "model": model_name, + "text_1": "ping", + "text_2": "pong", + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_rerank(server: RemoteOpenAIServer, model_name: str): + # rerank api is only enabled for num_labels == 1. + response = requests.post( + server.url_for("rerank"), + json={ + "model": model_name, + "query": "ping", + "documents": ["pong"], + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index a28d38729f9f0..ca7d1539ddb79 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1805,17 +1805,13 @@ async def init_app_state( request_logger=request_logger, log_error_stack=args.log_error_stack, ) if "classify" in supported_tasks else None - - enable_serving_reranking = ("classify" in supported_tasks and getattr( - model_config.hf_config, "num_labels", 0) == 1) state.openai_serving_scores = ServingScores( engine_client, model_config, state.openai_serving_models, request_logger=request_logger, log_error_stack=args.log_error_stack, - ) if ("embed" in supported_tasks or enable_serving_reranking) else None - + ) if ("embed" in supported_tasks or "score" in supported_tasks) else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, model_config, diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 66101e1a99243..62b3ee1abaca8 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -13,12 +13,15 @@ import torch.nn.functional as F from transformers import PretrainedConfig from vllm.config import ModelConfig, PoolerConfig +from vllm.logger import init_logger from vllm.pooling_params import PoolingParams from vllm.sequence import 
PoolerOutput, PoolingSequenceGroupOutput from vllm.tasks import PoolingTask from vllm.utils import current_stream, resolve_obj_by_qualname from vllm.v1.pool.metadata import PoolingCursor, PoolingMetadata +logger = init_logger(__name__) + PoolingFn = Callable[ [Union[torch.Tensor, list[torch.Tensor]], PoolingMetadata], Union[torch.Tensor, list[torch.Tensor]]] @@ -183,7 +186,7 @@ def get_cross_encoder_activation_function(config: PretrainedConfig): fn = resolve_obj_by_qualname(function_name)() return PoolerActivation.wraps(fn) - return PoolerScore() + return PoolerClassify() def build_output( @@ -371,24 +374,31 @@ class PoolerMultiLabelClassify(PoolerActivation): class PoolerClassify(PoolerActivation): + def __init__(self, *, static_num_labels: bool = True) -> None: + super().__init__() + + if static_num_labels: + from vllm.config import get_current_vllm_config + vllm_config = get_current_vllm_config() + self.num_labels = getattr(vllm_config.model_config.hf_config, + "num_labels", 0) + if self.num_labels == 0: + logger.warning("num_labels should be > 0 for classification" + "models, falling back to softmax. 
" + "Please check if the configuration is correct.") + else: + self.num_labels = None + def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor: - num_labels = pooled_data.shape[-1] + num_labels = (self.num_labels if self.num_labels is not None else + pooled_data.shape[-1]) + if num_labels < 2: return F.sigmoid(pooled_data.float()).to(pooled_data.dtype) return F.softmax(pooled_data.float(), dim=-1).to(pooled_data.dtype) -class PoolerScore(PoolerActivation): - - def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor: - num_labels = pooled_data.shape[-1] - if num_labels < 2: - return F.sigmoid(pooled_data.float()).to(pooled_data.dtype) - - return pooled_data - - class LambdaPoolerActivation(PoolerActivation): def __init__(self, fn: Callable[[torch.Tensor], torch.Tensor]): @@ -428,6 +438,10 @@ class EmbeddingPoolerHead(PoolerHead): def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor], pooling_metadata: PoolingMetadata): + if isinstance(pooled_data, list): + pooled_data = torch.stack(pooled_data) + # pooled_data shape: [batchsize, hidden_dimension] + # Apply ST projector if self.projector is not None: projector = cast(nn.Module, self.projector) @@ -437,17 +451,11 @@ class EmbeddingPoolerHead(PoolerHead): y = projector(x.to(torch.float32)) return y.to(orig_dtype) - if isinstance(pooled_data, torch.Tensor): - pooled_data = _proj(pooled_data) - else: - pooled_data = [_proj(t) for t in pooled_data] + pooled_data = _proj(pooled_data) + # pooled_data shape: [batchsize, embedding_dimension] pooling_params = get_pooling_params(pooling_metadata) - if isinstance(pooled_data, list): - pooled_data = torch.stack(pooled_data) - # pooled_data shape: [batchsize, embedding_dimension] - # for matryoshka representation dimensions_list = [ pooling_param.dimensions for pooling_param in pooling_params @@ -477,13 +485,14 @@ class EmbeddingPoolerHead(PoolerHead): for vecs, f in zip(pooled_data, flags) ] + # pooled_data shape: [batchsize, 
embedding_dimension] return pooled_data class RewardPoolerHead(PoolerHead): def __init__(self) -> None: - super().__init__(activation=PoolerClassify()) + super().__init__(activation=PoolerClassify(static_num_labels=False)) def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor], pooling_metadata: PoolingMetadata): @@ -637,19 +646,13 @@ class ClassifierPooler(Pooler): pooling_metadata: PoolingMetadata, ) -> PoolerOutput: pooled_data = self.pooling(hidden_states, pooling_metadata) - if isinstance(pooled_data, list): pooled_data = torch.stack(pooled_data) # pooled_data shape: [batchsize, hidden_size] if self.classifier is not None: - # apply classifier once on the full batch if possible - if isinstance(pooled_data, torch.Tensor): - pooled_data = self.classifier(pooled_data) - elif len({data.shape for data in pooled_data}) <= 1: - pooled_data = self.classifier(torch.stack(pooled_data)) - else: - pooled_data = [self.classifier(data) for data in pooled_data] + pooled_data = self.classifier(pooled_data) + # pooled_data shape: [batchsize, num_labels] pooling_params = get_pooling_params(pooling_metadata) flags = [p.activation for p in pooling_params] @@ -662,6 +665,7 @@ class ClassifierPooler(Pooler): for vecs, f in zip(pooled_data, flags) ] + # scores shape: [batchsize, num_labels] return build_output(scores) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0250a4e19a027..c6d50c17f2b4d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1248,10 +1248,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): and "encode" in supported_tasks): supported_tasks.remove("encode") - logger.info_once("Chunked prefill is not supported with " - "encode task which using ALL pooling. 
" - "Please turn off chunked prefill by " - "`--no-enable-chunked-prefill` before using it.") + logger.debug_once("Chunked prefill is not supported with " + "encode task which using ALL pooling. " + "Please turn off chunked prefill by " + "`--no-enable-chunked-prefill` before using it.") + + if "score" in supported_tasks: + num_labels = getattr(self.model_config.hf_config, "num_labels", 0) + if num_labels != 1: + supported_tasks.remove("score") + logger.debug_once( + "Score API is only enabled for num_labels == 1.") return supported_tasks From 69f46359dd5b36c1a059a0a8b729be1bd86394e8 Mon Sep 17 00:00:00 2001 From: Flora Feng <4florafeng@gmail.com> Date: Fri, 29 Aug 2025 03:36:57 -0700 Subject: [PATCH 25/56] [Multimodal] Consolidate mm inputs into MultiModalFeatureSpec (#23779) Signed-off-by: sfeng33 <4florafeng@gmail.com> --- tests/tokenization/test_detokenize.py | 2 -- tests/v1/core/test_kv_cache_utils.py | 22 +++++++----- tests/v1/core/test_prefix_caching.py | 22 +++++++----- tests/v1/core/test_scheduler.py | 26 +++++++------- tests/v1/core/utils.py | 30 ++++++++-------- tests/v1/engine/test_engine_core.py | 4 +-- tests/v1/engine/test_engine_core_client.py | 4 +-- .../v1/engine/test_fast_incdec_prefix_err.py | 18 +++++----- tests/v1/engine/test_output_processor.py | 30 ++++++---------- tests/v1/kv_connector/unit/utils.py | 4 +-- vllm/multimodal/cache.py | 16 +++++++-- vllm/multimodal/inputs.py | 23 +++++++++++++ vllm/v1/engine/__init__.py | 7 ++-- vllm/v1/engine/core.py | 16 ++++----- vllm/v1/engine/processor.py | 31 +++++++---------- vllm/v1/request.py | 34 ++++++------------- 16 files changed, 143 insertions(+), 146 deletions(-) diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index ccafc88461275..ea7ccfbb2b456 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -64,8 +64,6 @@ def _run_incremental_decode(tokenizer, request = EngineCoreRequest("", prompt_token_ids, None, - 
None, - None, params, None, None, diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 47c74aff1e753..c4f927d69c2dd 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -7,7 +7,8 @@ import pytest import torch from vllm.config import ModelConfig, SchedulerConfig, VllmConfig -from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange +from vllm.multimodal.inputs import (MultiModalFeatureSpec, + MultiModalKwargsItem, PlaceholderRange) from vllm.sampling_params import SamplingParams from vllm.utils import GiB_bytes, sha256, sha256_cbor_64bit from vllm.v1.core.kv_cache_manager import KVCacheManager @@ -37,17 +38,20 @@ def make_request( mm_hashes: Optional[list[str]] = None, cache_salt: Optional[str] = None, ): - if mm_positions is None: - mm_kwargs = None - else: - mm_item = MultiModalKwargsItem.dummy("dummy_m") - mm_kwargs = [mm_item] * len(mm_positions) + mm_features = [] + if mm_positions is not None: + for j, position in enumerate(mm_positions): + identifier = mm_hashes[j] if mm_hashes else f"hash_{j}" + mm_feature = MultiModalFeatureSpec( + data=MultiModalKwargsItem.dummy("dummy_m"), + mm_position=position, + identifier=identifier, + modality="image") + mm_features.append(mm_feature) return Request(request_id=request_id, prompt_token_ids=prompt_token_ids, - multi_modal_kwargs=mm_kwargs, - multi_modal_hashes=mm_hashes, - multi_modal_placeholders=mm_positions, + mm_features=mm_features if mm_features else None, sampling_params=SamplingParams(max_tokens=17), pooling_params=None, eos_token_id=100, diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 89824768ed909..e7a8f63702b30 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -9,7 +9,8 @@ import pytest import torch from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved -from vllm.multimodal.inputs import 
MultiModalKwargsItem, PlaceholderRange +from vllm.multimodal.inputs import (MultiModalFeatureSpec, + MultiModalKwargsItem, PlaceholderRange) from vllm.sampling_params import SamplingParams from vllm.utils import sha256, sha256_cbor_64bit from vllm.v1.core.block_pool import BlockPool @@ -32,17 +33,20 @@ def make_request( prompt_logprobs: Optional[int] = None, cache_salt: Optional[str] = None, ): - if mm_positions is None: - mm_kwargs = None - else: - mm_item = MultiModalKwargsItem.dummy("dummy_m") - mm_kwargs = [mm_item] * len(mm_positions) + mm_features = [] + if mm_positions is not None: + for j, position in enumerate(mm_positions): + identifier = mm_hashes[j] if mm_hashes else f"hash_{j}" + mm_feature = MultiModalFeatureSpec( + data=MultiModalKwargsItem.dummy("dummy_m"), + mm_position=position, + identifier=identifier, + modality="image") + mm_features.append(mm_feature) return Request(request_id=request_id, prompt_token_ids=prompt_token_ids, - multi_modal_kwargs=mm_kwargs, - multi_modal_hashes=mm_hashes, - multi_modal_placeholders=mm_positions, + mm_features=mm_features if mm_features else None, sampling_params=SamplingParams( max_tokens=17, prompt_logprobs=prompt_logprobs), pooling_params=None, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 70e8691788045..572d6c9c889f6 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -8,7 +8,8 @@ import torch from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, SchedulerConfig, SpeculativeConfig, VllmConfig) -from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange +from vllm.multimodal.inputs import (MultiModalFeatureSpec, + MultiModalKwargsItem, PlaceholderRange) from vllm.sampling_params import GuidedDecodingParams, SamplingParams from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler @@ -1308,21 +1309,24 @@ def create_requests_with_priority( 
prompt_logprobs=prompt_logprobs) requests = [] for i in range(num_requests): + mm_features = [] if mm_positions is not None: mm_position = mm_positions[i] - mm_item = MultiModalKwargsItem.dummy("dummy_m") - mm_kwargs = [mm_item] * len(mm_position) - else: - mm_position = None - mm_kwargs = None + for j, position in enumerate(mm_position): + identifier = f"hash{i}_{j}" + mm_feature = MultiModalFeatureSpec( + data=MultiModalKwargsItem.dummy("dummy_m"), + mm_position=position, + identifier=identifier, + modality="image") + mm_features.append(mm_feature) + request = Request( request_id=f"{i + starting_idx}", prompt_token_ids=[i + starting_idx] * num_tokens, sampling_params=sampling_params, pooling_params=None, - multi_modal_kwargs=mm_kwargs, - multi_modal_placeholders=mm_position, - multi_modal_hashes=None, + mm_features=mm_features if mm_features else None, eos_token_id=EOS_TOKEN_ID, arrival_time=arrival_times[i], priority=priorities[i], @@ -1801,9 +1805,7 @@ def test_schedule_skip_tokenizer_init_structured_output_request(): request = Request( request_id="0", prompt_token_ids=[0, 1], - multi_modal_kwargs=None, - multi_modal_hashes=None, - multi_modal_placeholders=None, + mm_features=None, sampling_params=sampling_params, pooling_params=None, eos_token_id=EOS_TOKEN_ID, diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 78a71f10a5940..e392c2c336e9b 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -6,7 +6,8 @@ import torch from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, SchedulerConfig, SpeculativeConfig, VllmConfig) -from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange +from vllm.multimodal.inputs import (MultiModalFeatureSpec, + MultiModalKwargsItem, PlaceholderRange) from vllm.sampling_params import SamplingParams from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, init_none_hash) @@ -139,19 +140,20 @@ def create_requests( prompt_logprobs=prompt_logprobs) requests = [] for i in 
range(num_requests): + mm_features = [] if mm_positions is not None: mm_position = mm_positions[i] - mm_item = MultiModalKwargsItem.dummy("dummy_m") - mm_kwargs = [mm_item] * len(mm_position) - # Dummy hash for each mm item should be unique - # since encoder cache tracks entries by hash - mm_hashes = [ - "hash" + str(i) + "_" + str(j) for j in range(len(mm_position)) - ] - else: - mm_position = None - mm_kwargs = None - mm_hashes = None + for j, position in enumerate(mm_position): + # Dummy hash for each mm item should be unique + # since encoder cache tracks entries by hash + identifier = f"hash{i}_{j}" + mm_feature = MultiModalFeatureSpec( + data=MultiModalKwargsItem.dummy("dummy_m"), + mm_position=position, + identifier=identifier, + modality="image") + mm_features.append(mm_feature) + prompt_token_ids = ([0] * num_tokens if same_prompt else [i] * num_tokens) request = Request( @@ -159,9 +161,7 @@ def create_requests( prompt_token_ids=prompt_token_ids, sampling_params=sampling_params, pooling_params=None, - multi_modal_kwargs=mm_kwargs, - multi_modal_placeholders=mm_position, - multi_modal_hashes=mm_hashes, + mm_features=mm_features if mm_features else None, eos_token_id=EOS_TOKEN_ID, block_hasher=block_hasher, ) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 2ea957a3e230f..e6f7ebf25970f 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -35,9 +35,7 @@ def make_request() -> EngineCoreRequest: return EngineCoreRequest( request_id=str(uuid.uuid4()), prompt_token_ids=PROMPT_TOKENS, - mm_kwargs=None, - mm_hashes=None, - mm_placeholders=None, + mm_features=None, sampling_params=SamplingParams(), pooling_params=None, eos_token_id=None, diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 37eb869fe69a3..625a3470e8025 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -52,9 
+52,7 @@ def make_request( return EngineCoreRequest( request_id=str(uuid.uuid4()), prompt_token_ids=prompt_tokens_ids, - mm_kwargs=None, - mm_hashes=None, - mm_placeholders=None, + mm_features=None, sampling_params=params, pooling_params=None, eos_token_id=None, diff --git a/tests/v1/engine/test_fast_incdec_prefix_err.py b/tests/v1/engine/test_fast_incdec_prefix_err.py index f028b4ab1d73f..f3d8e13088b03 100644 --- a/tests/v1/engine/test_fast_incdec_prefix_err.py +++ b/tests/v1/engine/test_fast_incdec_prefix_err.py @@ -26,16 +26,14 @@ def test_fast_inc_detok_invalid_utf8_err_case(): prompt_token_ids = [107, 4606, 236787, 107] params = SamplingParams(skip_special_tokens=True) request = EngineCoreRequest( - "test", - prompt_token_ids, - None, - None, - None, - params, - None, - None, - 0.0, - None, + request_id="test", + prompt_token_ids=prompt_token_ids, + mm_features=None, + sampling_params=params, + pooling_params=None, + eos_token_id=None, + arrival_time=0.0, + lora_request=None, cache_salt=None, data_parallel_rank=None, ) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index c113439a70228..6544e8b017e70 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -52,11 +52,9 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind, requests = [ EngineCoreRequest(request_id=f"request-{idx}", prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_kwargs=None, - mm_hashes=None, - mm_placeholders=None, + mm_features=None, eos_token_id=None, + arrival_time=0, lora_request=None, cache_salt=None, data_parallel_rank=None, @@ -401,11 +399,9 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind, requests = [ EngineCoreRequest(request_id=request_id_list[idx], prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_kwargs=None, - mm_hashes=None, - mm_placeholders=None, + mm_features=None, eos_token_id=None, + arrival_time=0, lora_request=None, 
cache_salt=None, data_parallel_rank=None, @@ -566,11 +562,9 @@ def test_stop_token(include_stop_str_in_output: bool, request = EngineCoreRequest( request_id=request_id, prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_kwargs=None, - mm_hashes=None, - mm_placeholders=None, + mm_features=None, eos_token_id=eos_token_id, + arrival_time=0, lora_request=None, cache_salt=None, data_parallel_rank=None, @@ -665,11 +659,9 @@ def test_stop_string(include_stop_str_in_output: bool, EngineCoreRequest( request_id=request_id_list[idx], prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_kwargs=None, - mm_hashes=None, - mm_placeholders=None, + mm_features=None, eos_token_id=None, + arrival_time=0, lora_request=None, cache_salt=None, data_parallel_rank=None, @@ -781,11 +773,9 @@ def test_iteration_stats(dummy_test_vectors): EngineCoreRequest( request_id=f"request-{idx}", prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_kwargs=None, - mm_hashes=None, - mm_placeholders=None, + mm_features=None, eos_token_id=None, + arrival_time=0, lora_request=None, cache_salt=None, data_parallel_rank=None, diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index a47f583b329e2..3f068d5e8c7eb 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -162,9 +162,7 @@ def create_request(request_id: int, prompt_token_ids=prompt_token_ids, sampling_params=sampling_params, pooling_params=None, - multi_modal_kwargs=None, - multi_modal_placeholders=None, - multi_modal_hashes=None, + mm_features=None, eos_token_id=EOS_TOKEN_ID, block_hasher=get_request_block_hasher(block_size, hash_fn), ) diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 0e81cb6d4d190..d385fcf61c9fa 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -12,9 +12,9 @@ from vllm.logger import init_logger from vllm.utils import GiB_bytes, LRUCache from vllm.utils.jsontree import json_map_leaves, json_reduce_leaves -from 
.inputs import (MultiModalFieldElem, MultiModalKwargs, - MultiModalKwargsItem, MultiModalKwargsItems, - NestedTensors) +from .inputs import (MultiModalFeatureSpec, MultiModalFieldElem, + MultiModalKwargs, MultiModalKwargsItem, + MultiModalKwargsItems, NestedTensors) if TYPE_CHECKING: from vllm.config import ModelConfig, VllmConfig @@ -418,6 +418,16 @@ class BaseMultiModalReceiverCache( MultiModalKwargsItem]): """The required interface for caches on P1.""" + def get_and_update_features( + self, + mm_features: list["MultiModalFeatureSpec"], + ) -> list["MultiModalFeatureSpec"]: + """Update multimodal features with cached encoder outputs.""" + for feature in mm_features: + feature.data = self.get_and_update_item(feature.data, + feature.identifier) + return mm_features + class MultiModalReceiverCache(BaseMultiModalReceiverCache): """ diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index cf6ab6c8dea2d..6fcc5bc772146 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -198,6 +198,29 @@ A dictionary containing nested tensors which have been batched via """ +@dataclass +class MultiModalFeatureSpec: + """ + Represents a single multimodal input with its processed data and metadata. + + Used by the V1 engine to track multimodal data through processing and + caching. A request containing multiple multimodal items will have one + MultiModalFeatureSpec per item. 
+ """ + + data: Optional["MultiModalKwargsItem"] + """Multimodal data for this feature""" + + modality: str + """Based on the input, e.g., "image", "audio", "video".""" + + identifier: str + """mm_hash or uuid for caching encoder outputs.""" + + mm_position: PlaceholderRange + """e.g., PlaceholderRange(offset=2, length=336)""" + + @dataclass class MultiModalFieldElem: """ diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index f7ec982db41b4..5d8959a3cd3fe 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -3,14 +3,13 @@ import enum import time -from collections.abc import Sequence from typing import Any, Optional, Union import msgspec import torch from vllm.lora.request import LoRARequest -from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange +from vllm.multimodal.inputs import MultiModalFeatureSpec from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.v1.metrics.stats import SchedulerStats @@ -48,9 +47,7 @@ class EngineCoreRequest( request_id: str prompt_token_ids: list[int] - mm_kwargs: Optional[Sequence[Optional[MultiModalKwargsItem]]] - mm_hashes: Optional[list[str]] - mm_placeholders: Optional[list[PlaceholderRange]] + mm_features: Optional[list[MultiModalFeatureSpec]] sampling_params: Optional[SamplingParams] pooling_params: Optional[PoolingParams] eos_token_id: Optional[int] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index a7038e2d2c264..785cbc9d8d565 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -434,15 +434,13 @@ class EngineCore: This function could be directly used in input processing thread to allow request initialization running in parallel with Model forward """ - if request.mm_hashes is not None: - assert request.mm_kwargs is not None - - # Note on thread safety: no race condition. 
- # `mm_receiver_cache` is reset at the end of LLMEngine init, - # and will only accessed in the input processing thread afterwards. - if self.mm_receiver_cache is not None: - request.mm_kwargs = self.mm_receiver_cache.get_and_update( - request.mm_kwargs, request.mm_hashes) + # Note on thread safety: no race condition. + # `mm_receiver_cache` is reset at the end of LLMEngine init, + # and will only accessed in the input processing thread afterwards. + if self.mm_receiver_cache is not None and request.mm_features: + request.mm_features = ( + self.mm_receiver_cache.get_and_update_features( + request.mm_features)) req = Request.from_engine_core_request(request, self.request_block_hasher) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index df915258d8637..6cff95c393444 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -12,7 +12,7 @@ from vllm.inputs.preprocess import InputPreprocessor from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.cache import processor_cache_from_config -from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange +from vllm.multimodal.inputs import MultiModalFeatureSpec from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams @@ -346,9 +346,8 @@ class Processor: pooling_params = params.clone() # Multimodal related. - sorted_mm_inputs: Optional[list[Optional[MultiModalKwargsItem]]] = None - sorted_mm_positions: Optional[list[PlaceholderRange]] = None - sorted_mm_hashes: Optional[list[str]] = None + mm_features: Optional[list[MultiModalFeatureSpec]] = None + if decoder_inputs["type"] == "multimodal": decoder_mm_inputs = decoder_inputs["mm_kwargs"] decoder_mm_positions = decoder_inputs["mm_placeholders"] @@ -359,25 +358,19 @@ class Processor: # in the input sequence. 
sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions) - sorted_mm_inputs = [ - decoder_mm_inputs[modality][idx] - for modality, idx in sorted_mm_idxs - ] - sorted_mm_positions = [ - decoder_mm_positions[modality][idx] - for modality, idx in sorted_mm_idxs - ] - sorted_mm_hashes = [ - decoder_mm_hashes[modality][idx] - for modality, idx in sorted_mm_idxs - ] + mm_features = [] + for modality, idx in sorted_mm_idxs: + mm_features.append( + MultiModalFeatureSpec( + data=decoder_mm_inputs[modality][idx], + modality=modality, + identifier=decoder_mm_hashes[modality][idx], + mm_position=decoder_mm_positions[modality][idx])) return decoder_inputs.get("prompt"), EngineCoreRequest( request_id=request_id, prompt_token_ids=decoder_inputs["prompt_token_ids"], - mm_kwargs=sorted_mm_inputs, - mm_hashes=sorted_mm_hashes, - mm_placeholders=sorted_mm_positions, + mm_features=mm_features, sampling_params=sampling_params, pooling_params=pooling_params, eos_token_id=eos_token_id, diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 4e99a9ccef46e..ad7477241ebbd 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -6,10 +6,9 @@ import time from functools import partial from typing import TYPE_CHECKING, Any, Callable, Optional, Union -from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange +from vllm.multimodal.inputs import MultiModalFeatureSpec from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams -from vllm.utils import is_list_of from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType, EngineCoreRequest, FinishReason) from vllm.v1.structured_output.request import StructuredOutputRequest @@ -26,14 +25,12 @@ class Request: self, request_id: str, prompt_token_ids: list[int], - multi_modal_kwargs: Optional[list[MultiModalKwargsItem]], - multi_modal_hashes: Optional[list[str]], - multi_modal_placeholders: Optional[list[PlaceholderRange]], sampling_params: Optional[SamplingParams], pooling_params: 
Optional[PoolingParams], eos_token_id: Optional[int], client_index: int = 0, arrival_time: Optional[float] = None, + mm_features: Optional[list[MultiModalFeatureSpec]] = None, lora_request: Optional["LoRARequest"] = None, structured_output_request: Optional["StructuredOutputRequest"] = None, cache_salt: Optional[str] = None, @@ -89,16 +86,14 @@ class Request: self.cache_salt: Optional[str] = cache_salt # Multi-modal related - self.mm_positions = multi_modal_placeholders or [] - self.mm_kwargs = multi_modal_kwargs or [] - self.mm_hashes: list[str] = multi_modal_hashes or [] - self.num_encoder_inputs = len(self.mm_kwargs) + self.mm_features = mm_features or [] + self.num_encoder_inputs = len(self.mm_features) self.has_encoder_inputs = self.num_encoder_inputs > 0 - - # Sanity check - assert len(self.mm_kwargs) == len(self.mm_positions) - if self.mm_hashes: - assert len(self.mm_kwargs) == len(self.mm_hashes) + # TODO(sfeng33): Remove these legacy fields after clearing out all + # references in scheduler and model runner + self.mm_positions = [f.mm_position for f in self.mm_features] + self.mm_kwargs = [f.data for f in self.mm_features] + self.mm_hashes = [f.identifier for f in self.mm_features] # Read-only views # Prevent directly appending to these lists since @@ -126,20 +121,11 @@ class Request: cls, request: EngineCoreRequest, block_hasher: Optional[Callable[["Request"], list["BlockHash"]]] ) -> "Request": - if request.mm_kwargs is not None: - mm_kwargs_lst = list(request.mm_kwargs) - assert is_list_of(mm_kwargs_lst, MultiModalKwargsItem), ( - "mm_kwargs was not updated in EngineCore.add_request") - else: - mm_kwargs_lst = None - return cls( request_id=request.request_id, client_index=request.client_index, prompt_token_ids=request.prompt_token_ids, - multi_modal_kwargs=mm_kwargs_lst, - multi_modal_hashes=request.mm_hashes, - multi_modal_placeholders=request.mm_placeholders, + mm_features=request.mm_features, sampling_params=request.sampling_params, 
pooling_params=request.pooling_params, eos_token_id=request.eos_token_id, From 67c14906aaa480d4fee2606f31c784ae21f8a633 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 29 Aug 2025 03:57:35 -0700 Subject: [PATCH 26/56] Update PyTorch to 2.8.0 (#20358) Signed-off-by: Huy Do Co-authored-by: Michael Goin --- .buildkite/test-pipeline.yaml | 4 +-- CMakeLists.txt | 4 +-- pyproject.toml | 2 +- requirements/build.txt | 3 +- requirements/cpu.txt | 9 +++-- requirements/cuda.txt | 10 +++--- requirements/rocm-build.txt | 8 ++--- requirements/test.in | 6 ++-- requirements/test.txt | 36 ++++++++++---------- tests/distributed/test_sequence_parallel.py | 2 +- tests/lora/test_chatglm3_tp.py | 6 +++- vllm/v1/attention/backends/flex_attention.py | 5 +-- 12 files changed, 50 insertions(+), 45 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f2652045526b2..482808cd07e8c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -462,8 +462,8 @@ steps: - tests/quantization commands: # temporary install here since we need nightly, will move to requirements/test.in - # after torchao 0.12 release - - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + # after torchao 0.12 release, and pin a working version of torchao nightly here + - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization - label: LM Eval Small Models # 53min diff --git a/CMakeLists.txt b/CMakeLists.txt index e92e08f0d0ecd..3f1f9a781a07a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1 # requirements.txt files and should be kept consistent. 
The ROCm torch # versions are derived from docker/Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1") -set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0") +set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0") # # Try to find python package with an executable that exactly matches diff --git a/pyproject.toml b/pyproject.toml index 013f2a6cd59e4..e63f8aeae2787 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "packaging>=24.2", "setuptools>=77.0.3,<80.0.0", "setuptools-scm>=8.0", - "torch == 2.7.1", + "torch == 2.8.0", "wheel", "jinja2", ] diff --git a/requirements/build.txt b/requirements/build.txt index dd644d621efc1..5f826a1afa144 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -4,7 +4,8 @@ ninja packaging>=24.2 setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 -torch==2.7.1 +torch==2.8.0 wheel jinja2>=3.1.6 regex +build diff --git a/requirements/cpu.txt b/requirements/cpu.txt index f4b95b72898cc..a48cb9fde000c 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -9,17 +9,16 @@ packaging>=24.2 setuptools>=77.0.3,<80.0.0 --extra-index-url https://download.pytorch.org/whl/cpu torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218 -torch==2.7.0; platform_system == "Darwin" -torch==2.7.0; platform_machine == "ppc64le" -torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has a issue: https://github.com/vllm-project/vllm/issues/17960 +torch==2.8.0; platform_system == "Darwin" +torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" -torchaudio==2.7.0; platform_machine == "ppc64le" +torchaudio==2.8.0; platform_machine == "ppc64le" # required for the image processor of phi3v, this 
must be updated alongside torch torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" -torchvision==0.22.0; platform_machine == "ppc64le" +torchvision==0.23.0; platform_machine == "ppc64le" datasets # for benchmark scripts # Intel Extension for PyTorch, only for x86_64 CPUs diff --git a/requirements/cuda.txt b/requirements/cuda.txt index fb30e493f80b3..3f8b8fca3209a 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -6,9 +6,9 @@ numba == 0.61.2; python_version > '3.9' # Dependencies for NVIDIA GPUs ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1. -torch==2.7.1 -torchaudio==2.7.1 +torch==2.8.0 +torchaudio==2.8.0 # These must be updated alongside torch -torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31 -xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 \ No newline at end of file +torchvision==0.23.0 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1 +xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8 diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index cbae9bbb8a9b3..affe562c24f6b 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -1,10 +1,10 @@ # Common dependencies -r common.txt ---extra-index-url https://download.pytorch.org/whl/rocm6.2.4 -torch==2.7.0 -torchvision==0.22.0 -torchaudio==2.7.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.3 +torch==2.8.0 +torchvision==0.23.0 +torchaudio==2.8.0 triton==3.3.0 cmake>=3.26.1,<4 diff --git a/requirements/test.in b/requirements/test.in index 92c577c501632..5b1688c76c954 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -22,9 +22,9 @@ sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests timm >=1.0.17 # required for internvl and gemma3n-mm test -torch==2.7.1 -torchaudio==2.7.1 -torchvision==0.22.1 +torch==2.8.0 +torchaudio==2.8.0 +torchvision==0.23.0 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test mistral_common[image,audio] >= 1.8.2 # required for voxtral test diff --git a/requirements/test.txt b/requirements/test.txt index 0c27c9bb67e82..0b728ebfb0071 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -541,42 +541,42 @@ numpy==1.26.4 # tritonclient # vocos # xarray -nvidia-cublas-cu12==12.8.3.14 +nvidia-cublas-cu12==12.8.4.1 # via # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch -nvidia-cuda-cupti-cu12==12.8.57 +nvidia-cuda-cupti-cu12==12.8.90 # via torch -nvidia-cuda-nvrtc-cu12==12.8.61 +nvidia-cuda-nvrtc-cu12==12.8.93 # via torch -nvidia-cuda-runtime-cu12==12.8.57 +nvidia-cuda-runtime-cu12==12.8.90 # via torch 
-nvidia-cudnn-cu12==9.7.1.26 +nvidia-cudnn-cu12==9.10.2.21 # via torch -nvidia-cufft-cu12==11.3.3.41 +nvidia-cufft-cu12==11.3.3.83 # via torch -nvidia-cufile-cu12==1.13.0.11 +nvidia-cufile-cu12==1.13.1.3 # via torch -nvidia-curand-cu12==10.3.9.55 +nvidia-curand-cu12==10.3.9.90 # via torch -nvidia-cusolver-cu12==11.7.2.55 +nvidia-cusolver-cu12==11.7.3.90 # via torch -nvidia-cusparse-cu12==12.5.7.53 +nvidia-cusparse-cu12==12.5.8.93 # via # nvidia-cusolver-cu12 # torch -nvidia-cusparselt-cu12==0.6.3 +nvidia-cusparselt-cu12==0.7.1 # via torch -nvidia-nccl-cu12==2.26.2 +nvidia-nccl-cu12==2.27.3 # via torch -nvidia-nvjitlink-cu12==12.8.61 +nvidia-nvjitlink-cu12==12.8.93 # via # nvidia-cufft-cu12 # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 # torch -nvidia-nvtx-cu12==12.8.55 +nvidia-nvtx-cu12==12.8.90 # via torch omegaconf==2.3.0 # via @@ -1069,7 +1069,7 @@ tomli==2.2.1 # via schemathesis tomli-w==1.2.0 # via schemathesis -torch==2.7.1+cu128 +torch==2.8.0+cu128 # via # -r requirements/test.in # accelerate @@ -1098,7 +1098,7 @@ torch==2.7.1+cu128 # torchvision # vector-quantize-pytorch # vocos -torchaudio==2.7.1+cu128 +torchaudio==2.8.0+cu128 # via # -r requirements/test.in # encodec @@ -1111,7 +1111,7 @@ torchmetrics==1.7.4 # pytorch-lightning # terratorch # torchgeo -torchvision==0.22.1+cu128 +torchvision==0.23.0+cu128 # via # -r requirements/test.in # lightly @@ -1152,7 +1152,7 @@ transformers==4.55.2 # transformers-stream-generator transformers-stream-generator==0.0.5 # via -r requirements/test.in -triton==3.3.1 +triton==3.4.0 # via torch tritonclient==2.51.0 # via diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index 49b8eddecb4a9..c93b436f384b9 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -292,7 +292,7 @@ SP_TEST_MODELS = [ # TODO support other models # [LANGUAGE GENERATION] "meta-llama/Llama-3.2-1B-Instruct", - 
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8" + "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", ] diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index fb00e7b65b04a..5cffb8cfcc26d 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -87,6 +87,9 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files): @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): + # https://github.com/NVIDIA/nccl/issues/1790, set a lower value for + # gpu_memory_utilization here because NCCL >= 2.26.3 seems to use + # more GPU memory causing vLLM to OOM llm = vllm.LLM(MODEL_PATH, max_model_len=1024, enable_lora=True, @@ -95,7 +98,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): tensor_parallel_size=4, trust_remote_code=True, fully_sharded_loras=True, - enable_chunked_prefill=True) + enable_chunked_prefill=True, + gpu_memory_utilization=0.85) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): assert output1[i] == EXPECTED_LORA_OUTPUT[i] diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index a596f6b2b32a4..d5b1c15e68d0e 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -789,6 +789,7 @@ def get_kernel_options(query, block_m, block_n, device_props = torch.cuda.get_device_properties() max_shared_memory = device_props.shared_memory_per_block_optin if max_shared_memory < 144 * 1024: - kernel_options["BLOCK_M"] = 32 - kernel_options["BLOCK_N"] = 32 + kernel_options["BLOCK_M"] = kernel_options["BLOCK_M"] // 2 + kernel_options["BLOCK_N"] = kernel_options["BLOCK_N"] // 2 + return kernel_options From 4f7cde7272359d886d8dd178eebb19d94b3cdd6c Mon Sep 17 00:00:00 2001 From: Adit Chawdhary <25533953+aditchawdhary@users.noreply.github.com> Date: Fri, 29 Aug 2025 17:58:13 +0530 Subject: [PATCH 
27/56] Adds `json_count_leaves` utility function (#23899) Signed-off-by: aditchawdhary --- tests/utils_/test_utils.py | 36 +++++++++++++++++++++++++++++++++--- vllm/multimodal/cache.py | 32 +++++++++++++++++++++++++++----- vllm/utils/jsontree.py | 14 ++++++++++++-- 3 files changed, 72 insertions(+), 10 deletions(-) diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 04195ea0cf92e..66124dd854ee0 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -379,9 +379,9 @@ def test_duplicate_dict_args(caplog_vllm, parser): def test_supports_kw(callable,kw_name,requires_kw_only, allow_var_kwargs,is_supported): assert supports_kw( - callable=callable, - kw_name=kw_name, - requires_kw_only=requires_kw_only, + callable=callable, + kw_name=kw_name, + requires_kw_only=requires_kw_only, allow_var_kwargs=allow_var_kwargs ) == is_supported @@ -948,6 +948,36 @@ def test_join_host_port(): assert join_host_port("::1", 5555) == "[::1]:5555" +def test_json_count_leaves(): + """Test json_count_leaves function from jsontree utility.""" + from vllm.utils.jsontree import json_count_leaves + + # Single leaf values + assert json_count_leaves(42) == 1 + assert json_count_leaves("hello") == 1 + assert json_count_leaves(None) == 1 + + # Empty containers + assert json_count_leaves([]) == 0 + assert json_count_leaves({}) == 0 + assert json_count_leaves(()) == 0 + + # Flat structures + assert json_count_leaves([1, 2, 3]) == 3 + assert json_count_leaves({"a": 1, "b": 2}) == 2 + assert json_count_leaves((1, 2, 3)) == 3 + + # Nested structures + nested_dict = {"a": 1, "b": {"c": 2, "d": 3}} + assert json_count_leaves(nested_dict) == 3 + + nested_list = [1, [2, 3], 4] + assert json_count_leaves(nested_list) == 4 + + mixed_nested = {"list": [1, 2], "dict": {"x": 3}, "value": 4} + assert json_count_leaves(mixed_nested) == 4 + + def test_convert_ids_list_to_tokens(): tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") token_ids = 
tokenizer.encode("Hello, world!") diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index d385fcf61c9fa..35b743ed21d92 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -10,7 +10,8 @@ from typing_extensions import TypeAlias, override from vllm.logger import init_logger from vllm.utils import GiB_bytes, LRUCache -from vllm.utils.jsontree import json_map_leaves, json_reduce_leaves +from vllm.utils.jsontree import (json_count_leaves, json_map_leaves, + json_reduce_leaves) from .inputs import (MultiModalFeatureSpec, MultiModalFieldElem, MultiModalKwargs, MultiModalKwargsItem, @@ -127,11 +128,32 @@ class MultiModalCache: ) if debug: - logger.debug("Calculated size of %s to be %.2f GiB", type(value), - size / GiB_bytes) + leaf_count = json_count_leaves(value) + logger.debug( + "Calculated size of %s to be %.2f GiB (%d leaves)", + type(value), + size / GiB_bytes, + leaf_count, + ) return size + @classmethod + def get_item_complexity(cls, value: MultiModalCacheValue) -> int: + """ + Get the number of leaf elements in a multi-modal cache value. + + This provides a measure of structural complexity that can be useful + for debugging cache performance and understanding data patterns. + + Args: + value: The multi-modal cache value to analyze. + + Returns: + The number of leaf elements in the nested structure. + """ + return json_count_leaves(value) + @classmethod def get_lru_cache( cls, @@ -184,7 +206,7 @@ class BaseMultiModalCache(ABC, Generic[_I, _O]): """ Possibly update a multi-modal item based on whether it is in the underlying cache. - + This update is done out-of-place and updates the cache eviction order. Args: @@ -262,7 +284,7 @@ class BaseMultiModalProcessorCache( in the underlying cache. This **DOES NOT** update the cache eviction order. - + Args: mm_hashes: The hash of each item to check. 
diff --git a/vllm/utils/jsontree.py b/vllm/utils/jsontree.py index 4cbe0f76e0067..457afb7e2c6ff 100644 --- a/vllm/utils/jsontree.py +++ b/vllm/utils/jsontree.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Helper functions to work with nested JSON structures.""" + from collections.abc import Iterable from functools import reduce from typing import Callable, TypeVar, Union, overload @@ -8,8 +9,12 @@ from typing import Callable, TypeVar, Union, overload _T = TypeVar("_T") _U = TypeVar("_U") -JSONTree = Union[dict[str, "JSONTree[_T]"], list["JSONTree[_T]"], - tuple["JSONTree[_T]", ...], _T] +JSONTree = Union[ + dict[str, "JSONTree[_T]"], + list["JSONTree[_T]"], + tuple["JSONTree[_T]", ...], + _T, +] """A nested JSON structure where the leaves need not be JSON-serializable.""" @@ -78,3 +83,8 @@ def json_reduce_leaves( json_iter_leaves(value), initial, ) + + +def json_count_leaves(value: JSONTree[_T]) -> int: + """Count the number of leaves in a nested JSON structure.""" + return sum(1 for _ in json_iter_leaves(value)) From 1cf3753b901ba874a830c19555bb31fe37f91231 Mon Sep 17 00:00:00 2001 From: EduardDurech <39579228+EduardDurech@users.noreply.github.com> Date: Fri, 29 Aug 2025 14:29:18 +0200 Subject: [PATCH 28/56] [MODEL] `Apertus` and `XIELU` (#23068) Signed-off-by: EduardDurech <39579228+EduardDurech@users.noreply.github.com> Co-authored-by: AllenHaoHuang --- .../models/language/generation/test_common.py | 3 +- tests/models/registry.py | 3 + tests/models/test_registry.py | 3 + vllm/model_executor/layers/activation.py | 111 ++++ vllm/model_executor/models/apertus.py | 576 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 6 files changed, 696 insertions(+), 1 deletion(-) create mode 100644 vllm/model_executor/models/apertus.py diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index 57382914bfea8..4c4434c94145a 
100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -92,7 +92,8 @@ AITER_MODEL_LIST = [ pytest.param( "allenai/OLMoE-1B-7B-0924-Instruct", marks=[pytest.mark.cpu_model], - ) + ), + pytest.param("swiss-ai/Apertus-8B"), # apertus ]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) diff --git a/tests/models/registry.py b/tests/models/registry.py index 13eb4872e7d84..a37ffdc311514 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -137,6 +137,9 @@ class _HfExamplesInfo: # yapf: disable _TEXT_GENERATION_EXAMPLE_MODELS = { # [Decoder-only] + "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B", + min_transformers_version="4.56.0", + trust_remote_code=True), "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", trust_remote_code=True), "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 8769ad45eb93e..36882aba5e941 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -24,6 +24,9 @@ from .registry import HF_EXAMPLE_MODELS @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs()) def test_registry_imports(model_arch): + # Skip if transformers version is incompatible + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + model_info.check_transformers_version(on_fail="skip") # Ensure all model classes can be imported successfully model_cls = ModelRegistry._try_load_model_cls(model_arch) assert model_cls is not None diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index f3248589abc47..eb7e494e32861 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -10,11 +10,14 @@ import torch.nn.functional as F from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from 
vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.utils import LazyDict +logger = init_logger(__name__) + @CustomOp.register("fatrelu_and_mul") class FatreluAndMul(CustomOp): @@ -363,6 +366,112 @@ class ReLUSquaredActivation(CustomOp): return self.forward_native(x) +@CustomOp.register("xielu") +class XIELU(CustomOp): + """ + Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010 + If the user has installed the nickjbrowning/XIELU, we import xIELU CUDA + Otherwise, we emit a single warning and use xIELU Python + """ + + def __init__( + self, + alpha_p_init: float = 0.8, + alpha_n_init: float = 0.8, + beta: float = 0.5, + eps: float = -1e-6, + dtype: torch.dtype = torch.bfloat16, + with_vector_loads: bool = False, + ): + super().__init__() + self.alpha_p = nn.Parameter( + torch.log(torch.exp(torch.tensor(alpha_p_init, dtype=dtype)) - + 1).unsqueeze(0)) + self.alpha_n = nn.Parameter( + torch.log( + torch.exp(torch.tensor(alpha_n_init - beta, dtype=dtype)) - + 1).unsqueeze(0)) + self.register_buffer("beta", torch.tensor(beta, dtype=dtype)) + self.register_buffer("eps", torch.tensor(eps, dtype=dtype)) + self.with_vector_loads = with_vector_loads + # Temporary until xIELU CUDA fully implemented + self._beta_scalar = float(self.beta.detach().cpu().float().item()) + self._eps_scalar = float(self.eps.detach().cpu().float().item()) + + self._xielu_cuda_obj = None + try: + import xielu.ops # noqa: F401 + + self._xielu_cuda_obj = torch.classes.xielu.XIELU() + msg = "Using experimental xIELU CUDA." + try: + from torch._dynamo import allow_in_graph + + self._xielu_cuda_fn = allow_in_graph(self._xielu_cuda) + msg += " Enabled torch._dynamo for xIELU CUDA." 
+ except Exception as err: + msg += (f" Could not enable torch._dynamo for xIELU ({err}) - " + "this may result in slower performance.") + self._xielu_cuda_fn = self._xielu_cuda + logger.warning_once(msg) + except Exception as err: + logger.warning_once( + "CUDA-fused xIELU not available (%s) –" + " falling back to a Python version.\n" + "For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`", + str(err), + ) + + def _xielu_python(self, x: torch.Tensor) -> torch.Tensor: + alpha_p = nn.functional.softplus(self.alpha_p) + alpha_n = self.beta + nn.functional.softplus(self.alpha_n) + return torch.where( + x > 0, + alpha_p * x * x + self.beta * x, + (torch.expm1(torch.min(x, self.eps)) - x) * alpha_n + + self.beta * x, + ) + + def _xielu_cuda(self, x: torch.Tensor) -> torch.Tensor: + """Firewall function to prevent torch.compile from seeing .item()""" + assert self._xielu_cuda_obj is not None, ( + "XIELU CUDA object must not be None") + original_shape = x.shape + # CUDA kernel expects 3D tensors, reshape if needed + while x.dim() < 3: + x = x.unsqueeze(0) + if x.dim() > 3: + x = x.view(-1, 1, x.size(-1)) + if original_shape != x.shape: + logger.warning_once( + "Warning: xIELU input tensor expects 3 dimensions" + " but got (shape: %s). Reshaping to (shape: %s).", + original_shape, + x.shape, + ) + result = self._xielu_cuda_obj.forward( + x, + self.alpha_p, + self.alpha_n, + # Temporary until xIELU CUDA fully implemented -> + # self.{beta,eps}.item() + self._beta_scalar, + self._eps_scalar, + self.with_vector_loads, + ) + return result.view(original_shape) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + if self._xielu_cuda_obj is not None and input.is_cuda: + if not torch._dynamo.is_compiling(): + return self._xielu_cuda_fn(input) + else: + logger.warning_once( + "torch._dynamo is compiling, using Python version of xIELU." 
+ ) + return self._xielu_python(input) + + class ScaledActivation(nn.Module): """An activation function with post-scale parameters. @@ -426,6 +535,8 @@ _ACTIVATION_REGISTRY = LazyDict({ lambda: nn.Tanh(), "sigmoid": lambda: nn.Sigmoid(), + "xielu": + lambda: XIELU(), }) diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py new file mode 100644 index 0000000000000..0de683d2cd060 --- /dev/null +++ b/vllm/model_executor/models/apertus.py @@ -0,0 +1,576 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2025 The Swiss AI Initiative. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate the architectural differences made by +# the Swiss AI Initiative that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Apertus model compatible with HuggingFace weights.""" +from collections.abc import Iterable +from typing import Any, Optional, Union + +import torch +from torch import nn +from transformers import ApertusConfig + +from vllm.attention import Attention, AttentionType +from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import XIELU +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsLoRA, SupportsPP +from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) + + +class ApertusMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + prefix: str = "", + reduce_results: bool = True, + ) -> None: + super().__init__() + self.up_proj = ColumnParallelLinear( + input_size=hidden_size, + output_size=intermediate_size, + bias=bias, + 
quant_config=quant_config, + prefix=f"{prefix}.up_proj", + ) + self.down_proj = RowParallelLinear( + input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj", + ) + if hidden_act != "xielu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only xIELU is supported for now.") + self.act_fn = XIELU() + + def forward(self, x): + x, _ = self.up_proj(x) + x = self.act_fn(x) + x, _ = self.down_proj(x) + return x + + +class ApertusAttention(nn.Module): + + def __init__( + self, + config: ApertusConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[dict[str, Any]] = None, + max_position_embeddings: int = 8192, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + bias_o_proj: bool = False, + cache_config: Optional[CacheConfig] = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + ) -> None: + super().__init__() + layer_idx = extract_layer_index(prefix) + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + # MistralConfig has an optional head_dim introduced by Mistral-Nemo + head_dim = getattr(config, "head_dim", None) + if head_dim is None: + head_dim = self.hidden_size // self.total_num_heads + self.head_dim = head_dim + # Phi models introduced a partial_rotary_factor parameter in the config + self.partial_rotary_factor = getattr(config, "partial_rotary_factor", + 1) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, + bias=bias_o_proj, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self._init_rotary_emb(config, + rope_scaling=rope_scaling, + quant_config=quant_config) + + sliding_window = None + if layer_types := getattr(config, "layer_types", None): + is_sliding = layer_types[layer_idx] == "sliding_attention" + if is_sliding: + sliding_window = config.sliding_window + + attn_cls = (EncoderOnlyAttention + if attn_type == AttentionType.ENCODER_ONLY else Attention) + + self.attn = attn_cls( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + per_layer_sliding_window=sliding_window, + attn_type=attn_type, + prefix=f"{prefix}.attn", + ) + + self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + + def forward( + self, + positions: 
torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q = self.q_norm(q.contiguous().view(-1, self.head_dim)).view_as(q) + k = self.k_norm(k.contiguous().view(-1, self.head_dim)).view_as(k) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + def _init_rotary_emb(self, config: ApertusConfig, + rope_scaling: Optional[dict[str, Any]], + quant_config: Optional[QuantizationConfig]) -> None: + is_neox_style = True + is_gguf = quant_config and quant_config.get_name() == "gguf" + if is_gguf and config.model_type == "apertus": + is_neox_style = False + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=int(self.partial_rotary_factor * self.head_dim), + max_position=self.max_position_embeddings, + base=self.rope_theta, + rope_scaling=rope_scaling, + is_neox_style=is_neox_style, + partial_rotary_factor=self.partial_rotary_factor, + ) + + +class ApertusDecoderLayer(nn.Module): + + def __init__( + self, + config: ApertusConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is not None and getattr( + config, "original_max_position_embeddings", None): + rope_scaling["original_max_position_embeddings"] = ( + config.original_max_position_embeddings) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + # Support abacusai/Smaug-72B-v0.1 with attention_bias + # Support internlm/internlm-7b with bias + attention_bias = getattr(config, "attention_bias", False) or getattr( + config, "bias", False) + bias_o_proj = attention_bias + # support internlm/internlm3-8b with qkv_bias + 
if hasattr(config, 'qkv_bias'): + attention_bias = config.qkv_bias + + # Apertus defaults to causal attention as it is a decoder-only model. + # You can override the HF config with `is_causal=False` to enable + # bidirectional attention, which is used in some embedding models + # (e.g. parasail-ai/GritLM-7B-vllm) + if getattr(config, "is_causal", True): + attn_type = AttentionType.DECODER + else: + attn_type = AttentionType.ENCODER_ONLY + + self.self_attn = ApertusAttention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + bias=attention_bias, + bias_o_proj=bias_o_proj, + cache_config=cache_config, + prefix=f"{prefix}.self_attn", + attn_type=attn_type, + ) + self.mlp = ApertusMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + bias=getattr(config, "mlp_bias", False), + prefix=f"{prefix}.mlp", + ) + self.attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.feedforward_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.attention_layernorm(hidden_states) + else: + hidden_states, residual = self.attention_layernorm( + hidden_states, residual) + hidden_states = self.self_attn(positions=positions, + hidden_states=hidden_states) + + # Fully Connected + hidden_states, residual = self.feedforward_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + 
+@support_torch_compile +class ApertusModel(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = "", + layer_type: type[nn.Module] = ApertusDecoderLayer): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + self.quant_config = quant_config + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + if get_pp_group().is_first_rank or (config.tie_word_embeddings + and get_pp_group().is_last_rank): + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + quant_config=quant_config, + ) + else: + self.embed_tokens = PPMissingLayer() + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: layer_type(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + prefix=f"{prefix}.layers", + ) + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + + self.aux_hidden_state_layers = tuple[int, ...]() + + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors, tuple[torch.Tensor, + list[torch.Tensor]]]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + 
hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + aux_hidden_states = [] + for idx, layer in enumerate( + self.layers[self.start_layer:self.end_layer]): + if idx in self.aux_hidden_state_layers: + aux_hidden_states.append(hidden_states + residual) + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + + if len(aux_hidden_states) > 0: + return hidden_states, aux_hidden_states + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + if "scale" in name: + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class ApertusForCausalLM(nn.Module, SupportsLoRA, SupportsPP): + packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]} + + # LoRA specific attributes + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings" + } + embedding_padding_modules = ["lm_head"] + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = "", + layer_type: type[nn.Module] = ApertusDecoderLayer): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config + self.lora_config = lora_config + + self.model = self._init_model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model"), + layer_type=layer_type) + + if get_pp_group().is_last_rank: + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=( + 
DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else + lora_config.lora_vocab_padding_size), + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) + if config.tie_word_embeddings: + self.lm_head = self.lm_head.tie_weights( + self.model.embed_tokens) + + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, + logit_scale) + else: + self.lm_head = PPMissingLayer() + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + self.model.aux_hidden_state_layers = layers + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + num_layers = len(self.model.layers) + return (2, num_layers // 2, num_layers - 3) + + def _init_model(self, + vllm_config: VllmConfig, + prefix: str = "", + layer_type: type[nn.Module] = ApertusDecoderLayer): + return ApertusModel(vllm_config=vllm_config, + prefix=prefix, + layer_type=layer_type) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + model_output = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) + return model_output + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if 
self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 9040189ee5585..98115f8623563 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -39,6 +39,7 @@ logger = init_logger(__name__) # yapf: disable _TEXT_GENERATION_MODELS = { # [Decoder-only] + "ApertusForCausalLM": ("apertus", "ApertusForCausalLM"), "AquilaModel": ("llama", "LlamaForCausalLM"), "AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2 "ArceeForCausalLM": ("arcee", "ArceeForCausalLM"), From 0a2f4c0793988d3cf0d47b5f771fb38231db4b2b Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Fri, 29 Aug 2025 15:42:57 +0100 Subject: [PATCH 29/56] [Models] Use in-place adds in Idefics2Vision (#23932) Signed-off-by: Lukas Geiger --- vllm/model_executor/models/idefics2_vision_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 88b2a295905b7..0ca2e9e4bb688 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -108,7 +108,7 @@ class Idefics2VisionEmbeddings(nn.Module): bucket_coords_w).flatten() position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids position_ids = position_ids.to(self.position_embedding.weight.device) - embeddings = embeddings + self.position_embedding(position_ids) + embeddings += self.position_embedding(position_ids) return embeddings @@ -262,11 +262,11 @@ class Idefics2EncoderLayer(nn.Module): residual = hidden_states hidden_states = self.layer_norm1(hidden_states) hidden_states = self.self_attn(hidden_states) - hidden_states = residual + hidden_states + hidden_states += residual residual = hidden_states hidden_states = self.layer_norm2(hidden_states) hidden_states = self.mlp(hidden_states) - 
hidden_states = residual + hidden_states + hidden_states += residual return hidden_states From d90d8eb674f3870b8c85515a468108d5f1bd609a Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 29 Aug 2025 08:17:27 -0700 Subject: [PATCH 30/56] [BugFix] Async scheduling and PP compatibility with DP (#23770) Signed-off-by: Nick Hill --- tests/v1/engine/test_engine_core.py | 59 +++++++++--------------- tests/v1/test_async_llm_dp.py | 6 ++- vllm/executor/ray_utils.py | 6 +++ vllm/v1/engine/core.py | 63 ++++++++++++++------------ vllm/v1/executor/abstract.py | 9 ++-- vllm/v1/executor/multiproc_executor.py | 15 ++++-- vllm/v1/worker/gpu_worker.py | 45 +++++++++--------- 7 files changed, 105 insertions(+), 98 deletions(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index e6f7ebf25970f..98265c6349578 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -306,17 +306,17 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): # Schedule Batch 1: (10, req0) assert engine_core.step_with_batch_queue()[0] is None - assert engine_core.batch_queue.qsize() == 1 - scheduler_output = engine_core.batch_queue.queue[-1][1] + assert len(engine_core.batch_queue) == 1 + scheduler_output = engine_core.batch_queue[-1][1] assert scheduler_output.num_scheduled_tokens["0"] == 10 # num_computed_tokens should have been updated immediately. 
assert engine_core.scheduler.requests[ req0.request_id].num_computed_tokens == 10 # Schedule Batch 2: (2, req0), (8, req1) - assert engine_core.step_with_batch_queue()[0] is None - assert engine_core.batch_queue.qsize() == 2 - scheduler_output = engine_core.batch_queue.queue[-1][1] + assert engine_core.step_with_batch_queue()[0] == {} + assert len(engine_core.batch_queue) == 1 + scheduler_output = engine_core.batch_queue[-1][1] assert scheduler_output.num_scheduled_tokens["0"] == 2 assert scheduler_output.num_scheduled_tokens["1"] == 8 # num_computed_tokens should have been updated immediately. @@ -325,42 +325,32 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): assert engine_core.scheduler.get_num_unfinished_requests() == 2 - # Batch queue is full. Finish Batch 1. - engine_core.step_with_batch_queue() - - # Schedule Batch 3: (4, req1). Note that req0 cannot be scheduled + # Finish Batch 1 and schedule Batch 3: (4, req1). + # Note that req0 cannot be scheduled # because it is in the decoding stage now. engine_core.step_with_batch_queue() - assert engine_core.batch_queue.qsize() == 2 - scheduler_output = engine_core.batch_queue.queue[-1][1] + assert len(engine_core.batch_queue) == 1 + scheduler_output = engine_core.batch_queue[-1][1] assert scheduler_output.num_scheduled_tokens["1"] == 4 - # Batch queue is full. Finish Batch 2. Get first token of req0. + # Finish Batch 2. Get first token of req0. + # Schedule Batch 4: (1, req0). output = engine_core.step_with_batch_queue()[0].get(0) assert output is not None assert len(output.outputs) == 1 assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13 - - # Schedule Batch 4: (1, req0). - engine_core.step_with_batch_queue() - assert engine_core.batch_queue.qsize() == 2 - scheduler_output = engine_core.batch_queue.queue[-1][1] + scheduler_output = engine_core.batch_queue[-1][1] assert scheduler_output.num_scheduled_tokens["0"] == 1 - # Batch queue is full. Finish Batch 3. 
Get first token of req1. + # Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1). output = engine_core.step_with_batch_queue()[0].get(0) assert output is not None assert len(output.outputs) == 1 assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13 - - # Schedule Batch 5: (1, req1). - engine_core.step_with_batch_queue() - assert engine_core.batch_queue.qsize() == 2 - scheduler_output = engine_core.batch_queue.queue[-1][1] + scheduler_output = engine_core.batch_queue[-1][1] assert scheduler_output.num_scheduled_tokens["1"] == 1 # Loop until req0 is finished. - step = 0 req_id = 0 expected_num_tokens = [ engine_core.scheduler.requests["0"].num_tokens + 1, @@ -368,19 +358,14 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): ] while engine_core.scheduler.get_num_unfinished_requests() == 2: output = engine_core.step_with_batch_queue()[0] - if step % 2 == 0: - # Even steps consumes an output. - assert output is not None - assert len(output[0].outputs) == 1 - if req_id in engine_core.scheduler.requests: - assert engine_core.scheduler.requests[ - req_id].num_tokens == expected_num_tokens[req_id] - expected_num_tokens[req_id] += 1 - req_id = (req_id + 1) % 2 - else: - # Odd steps schedules a new batch. - assert output is None - step += 1 + # Every step consumes an output. 
+ assert output is not None + assert len(output[0].outputs) == 1 + if req_id in engine_core.scheduler.requests: + assert engine_core.scheduler.requests[ + req_id].num_tokens == expected_num_tokens[req_id] + expected_num_tokens[req_id] += 1 + req_id = (req_id + 1) % 2 @multi_gpu_test(num_gpus=2) diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py index c2610a87ac780..32da58011be98 100644 --- a/tests/v1/test_async_llm_dp.py +++ b/tests/v1/test_async_llm_dp.py @@ -75,9 +75,10 @@ async def generate( ], ) @pytest.mark.parametrize("data_parallel_backend", ["mp", "ray"]) +@pytest.mark.parametrize("async_scheduling", [True, False]) @pytest.mark.asyncio -async def test_load(output_kind: RequestOutputKind, - data_parallel_backend: str): +async def test_load(output_kind: RequestOutputKind, data_parallel_backend: str, + async_scheduling: bool): stats_loggers = {} @@ -105,6 +106,7 @@ async def test_load(output_kind: RequestOutputKind, prompt = "This is a test of data parallel" engine_args.data_parallel_backend = data_parallel_backend + engine_args.async_scheduling = async_scheduling engine = AsyncLLM.from_engine_args(engine_args, stat_loggers=[SimpleStatsLogger]) after.callback(engine.shutdown) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 7abaffa54c089..4b2a15afb67a7 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -10,6 +10,7 @@ import msgspec import vllm.platforms from vllm.config import ParallelConfig +from vllm.distributed import get_pp_group from vllm.executor.msgspec_utils import decode_hook, encode_hook from vllm.logger import init_logger from vllm.platforms import current_platform @@ -136,6 +137,11 @@ try: scheduler_output, intermediate_tensors) if isinstance(output, IntermediateTensors): output = scheduler_output, output + elif not get_pp_group().is_last_rank: + # Case where there are no scheduled requests + # but may still be finished requests. 
+ assert not output or not output.req_ids + output = scheduler_output, None return output def override_env_vars(self, vars: Dict[str, str]): diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 785cbc9d8d565..922c06b44be88 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -138,12 +138,12 @@ class EngineCore: # schedule and execute batches, and is required by pipeline parallelism # to eliminate pipeline bubbles. self.batch_queue_size = self.model_executor.max_concurrent_batches - self.batch_queue: Optional[queue.Queue[tuple[Future[ModelRunnerOutput], - SchedulerOutput]]] = None + self.batch_queue: Optional[deque[tuple[Future[ModelRunnerOutput], + SchedulerOutput]]] = None if self.batch_queue_size > 1: logger.info("Batch queue is enabled with size %d", self.batch_queue_size) - self.batch_queue = queue.Queue(self.batch_queue_size) + self.batch_queue = deque(maxlen=self.batch_queue_size) self.request_block_hasher: Optional[Callable[[Request], list[BlockHash]]] = None @@ -319,41 +319,43 @@ class EngineCore: batch in the job queue is finished. 3. Update the scheduler from the output. """ - assert self.batch_queue is not None + batch_queue = self.batch_queue + assert batch_queue is not None - engine_core_outputs = None - scheduler_output = None # Try to schedule a new batch if the batch queue is not full, but # the scheduler may return an empty batch if all requests are scheduled. # Note that this is not blocking. 
- if not self.batch_queue.full(): + assert len(batch_queue) < self.batch_queue_size + + model_executed = False + if self.scheduler.has_requests(): scheduler_output = self.scheduler.schedule() - if scheduler_output.total_num_scheduled_tokens > 0: - future = self.model_executor.execute_model(scheduler_output) - self.batch_queue.put_nowait( - (future, scheduler_output)) # type: ignore + future = self.model_executor.execute_model(scheduler_output) + batch_queue.appendleft( + (future, scheduler_output)) # type: ignore[arg-type] - scheduled_batch = (scheduler_output is not None - and scheduler_output.total_num_scheduled_tokens > 0) + model_executed = scheduler_output.total_num_scheduled_tokens > 0 + if model_executed and len(batch_queue) < self.batch_queue_size \ + and not batch_queue[-1][0].done(): + # Don't block on next worker response unless the queue is full + # or there are no more requests to schedule. + return None, True - # If no more requests can be scheduled and the job queue is not empty, - # block until the first batch in the job queue is finished. - # TODO(comaniac): Ideally we should peek the first batch in the - # job queue to check if it's finished before scheduling a new batch, - # but peeking the first element in a queue is not thread-safe, - # so we need more work. - if not scheduled_batch and not self.batch_queue.empty(): - future, scheduler_output = self.batch_queue.get_nowait() + elif not batch_queue: + # Queue is empty. We should not reach here since this method should + # only be called when the scheduler contains requests or the queue + # is non-empty. + return None, False - # Blocking until the first result is available. - model_output = self.execute_model_with_error_logging( - lambda _: future.result(), scheduler_output) + # Block until the next result is available. 
+ future, scheduler_output = batch_queue.pop() + model_output = self.execute_model_with_error_logging( + lambda _: future.result(), scheduler_output) - self.batch_queue.task_done() - engine_core_outputs = (self.scheduler.update_from_output( - scheduler_output, model_output)) + engine_core_outputs = self.scheduler.update_from_output( + scheduler_output, model_output) - return engine_core_outputs, scheduled_batch + return engine_core_outputs, model_executed def shutdown(self): self.structured_output_manager.clear_backend() @@ -388,7 +390,7 @@ class EngineCore: return self.model_executor.is_sleeping def execute_dummy_batch(self): - self.model_executor.collective_rpc("execute_dummy_batch") + self.model_executor.execute_dummy_batch() def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_executor.add_lora(lora_request) @@ -733,7 +735,8 @@ class EngineCoreProc(EngineCore): """Exits when an engine step needs to be performed.""" waited = False - while not self.engines_running and not self.scheduler.has_requests(): + while not self.engines_running and not self.scheduler.has_requests() \ + and not self.batch_queue: if logger.isEnabledFor(DEBUG) and self.input_queue.empty(): logger.debug("EngineCore waiting for work.") waited = True diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 4be2f74177b1f..68408a0b8a3d5 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -81,12 +81,10 @@ class Executor(ExecutorBase): pass def determine_available_memory(self) -> list[int]: # in bytes - output = self.collective_rpc("determine_available_memory") - return output + return self.collective_rpc("determine_available_memory") def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]: - output = self.collective_rpc("get_kv_cache_spec") - return output + return self.collective_rpc("get_kv_cache_spec") def execute_model( self, @@ -96,6 +94,9 @@ class Executor(ExecutorBase): args=(scheduler_output, )) return output[0] + 
def execute_dummy_batch(self) -> None: + self.collective_rpc("execute_dummy_batch") + def take_draft_token_ids(self) -> Optional[DraftTokenIds]: output = self.collective_rpc("take_draft_token_ids") return output[0] diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 15b88a2128994..12e79ff165f4e 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -191,6 +191,10 @@ class MultiprocExecutor(Executor): outputs, self.output_rank) return self.kv_output_aggregator.aggregate(outputs, self.output_rank) + def execute_dummy_batch(self) -> None: + self.collective_rpc("execute_dummy_batch", + unique_reply_rank=self.output_rank) + def take_draft_token_ids(self) -> Optional[DraftTokenIds]: # OPTIMIZATION: Get output only from a single worker (output_rank) outputs = self.collective_rpc("take_draft_token_ids", @@ -242,12 +246,17 @@ class MultiprocExecutor(Executor): dequeue_timeout = None if deadline is None else ( deadline - time.monotonic()) - if non_block: + if self.io_thread_pool is not None: + # We must consume worker_response_mq from a single thread. 
result = self.io_thread_pool.submit( # type: ignore get_response, w, dequeue_timeout, self.shutdown_event) - else: + if not non_block: + result = result.result() + elif not non_block: result = get_response(w, dequeue_timeout) - + else: + raise RuntimeError("non_block can only be used when" + " max_concurrent_batches > 1") responses.append(result) return responses diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index c252193313344..2088bfff5bb39 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -354,36 +354,37 @@ class Worker(WorkerBase): scheduler_output: "SchedulerOutput", ) -> Optional[ModelRunnerOutput]: intermediate_tensors = None - if not get_pp_group().is_first_rank: + forward_pass = scheduler_output.total_num_scheduled_tokens > 0 + if forward_pass and not get_pp_group().is_first_rank: intermediate_tensors = IntermediateTensors( get_pp_group().recv_tensor_dict( all_gather_group=get_tp_group())) output = self.model_runner.execute_model(scheduler_output, intermediate_tensors) - - parallel_config = self.vllm_config.parallel_config - if parallel_config.distributed_executor_backend != "external_launcher" \ - and not get_pp_group().is_last_rank: - assert isinstance(output, IntermediateTensors) - get_pp_group().send_tensor_dict(output.tensors, - all_gather_group=get_tp_group()) - - kv_connector_output = output.kv_connector_output - if not kv_connector_output: - return None - - # In case of PP with kv transfer, we need to pass through the - # kv_connector_output - if (not kv_connector_output.finished_sending - and not kv_connector_output.finished_recving): - return EMPTY_MODEL_RUNNER_OUTPUT - - output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT) - output.kv_connector_output = kv_connector_output + if isinstance(output, ModelRunnerOutput): return output - assert isinstance(output, ModelRunnerOutput) + assert isinstance(output, IntermediateTensors) + parallel_config = self.vllm_config.parallel_config + assert 
parallel_config.distributed_executor_backend != ( + "external_launcher") and not get_pp_group().is_last_rank + + get_pp_group().send_tensor_dict(output.tensors, + all_gather_group=get_tp_group()) + + kv_connector_output = output.kv_connector_output + if not kv_connector_output: + return None + + # In case of PP with kv transfer, we need to pass through the + # kv_connector_output + if (not kv_connector_output.finished_sending + and not kv_connector_output.finished_recving): + return EMPTY_MODEL_RUNNER_OUTPUT + + output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT) + output.kv_connector_output = kv_connector_output return output def take_draft_token_ids(self) -> Optional[DraftTokenIds]: From 72a69132dc540fe7168ffdbb761412fa569f323f Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 29 Aug 2025 23:29:21 +0800 Subject: [PATCH 31/56] [CI] Add `aiter` to matching list of issue auto labeller for `rocm` tag (#23942) Signed-off-by: vllmellm --- .github/workflows/issue_autolabel.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml index 6401d6586cc3d..e0ab3872d8fa3 100644 --- a/.github/workflows/issue_autolabel.yml +++ b/.github/workflows/issue_autolabel.yml @@ -49,6 +49,10 @@ jobs: term: "VLLM_ROCM_", searchIn: "both" }, + { + term: "aiter", + searchIn: "title" + }, { term: "rocm", searchIn: "title" From 0dc9532065c5f98952cb82d4c497e49ca09400bf Mon Sep 17 00:00:00 2001 From: yzds <41983536+youzhedian@users.noreply.github.com> Date: Sat, 30 Aug 2025 00:36:39 +0800 Subject: [PATCH 32/56] [BUGFIX ] fix undefined silu_and_mul_nvfp4_quant (#23929) Signed-off-by: hongchao Signed-off-by: Richard Zou Co-authored-by: hongchao Co-authored-by: Richard Zou Co-authored-by: Richard Zou --- csrc/ops.h | 4 ++-- csrc/torch_bindings.cpp | 3 ++- vllm/compilation/fix_functionalization.py | 4 +++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index 78a487201bdd4..7a176a5c00322 
100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -130,8 +130,8 @@ void silu_and_mul(torch::Tensor& out, torch::Tensor& input); void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& scale); -#ifndef USE_ROCM - +#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ + (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) void silu_and_mul_nvfp4_quant(torch::Tensor& out, torch::Tensor& output_block_scale, torch::Tensor& input, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index b769c09adc0f0..56626a02c0277 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -115,7 +115,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()"); ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant); -#ifndef USE_ROCM +#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ + (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) ops.def( "silu_and_mul_nvfp4_quant(Tensor! result, Tensor! 
result_block_scale, " "Tensor input, Tensor input_global_scale) -> ()"); diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index a36dd8b845f1a..6bc721eec3d45 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -97,7 +97,9 @@ class FixFunctionalizationPass(VllmInductorPass): node, mutated_args, args=('result', 'input', 'scale')) - elif at_target == torch.ops._C.silu_and_mul_nvfp4_quant.default: + elif hasattr( + torch.ops._C, "silu_and_mul_nvfp4_quant" + ) and at_target == torch.ops._C.silu_and_mul_nvfp4_quant.default: mutated_args = {1: 'result', 2: 'result_block_scale'} self.defunctionalize(graph, node, From 4d7fe40fc0468b44404c32d87e4ae0158de24cdc Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Fri, 29 Aug 2025 10:09:55 -0700 Subject: [PATCH 33/56] [RL][BugFix] Fix missing tokenizer error for token-in-token-out (#23904) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> Co-authored-by: Cyrus Leung --- .../openai/test_token_in_token_out.py | 73 +++++++++++++++++++ vllm/entrypoints/openai/serving_completion.py | 6 +- vllm/entrypoints/openai/serving_engine.py | 38 +++++----- 3 files changed, 99 insertions(+), 18 deletions(-) create mode 100644 tests/entrypoints/openai/test_token_in_token_out.py diff --git a/tests/entrypoints/openai/test_token_in_token_out.py b/tests/entrypoints/openai/test_token_in_token_out.py new file mode 100644 index 0000000000000..ed003939c44be --- /dev/null +++ b/tests/entrypoints/openai/test_token_in_token_out.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +import tempfile + +import pytest + +from vllm.model_executor.model_loader.weight_utils import ( + download_weights_from_hf) +from vllm.transformers_utils.tokenizer import get_tokenizer + +from ...utils import RemoteOpenAIServer + 
+MODEL_NAME = "Qwen/Qwen3-0.6B" +MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b") + + +@pytest.fixture(scope="module") +def server(): + global MODEL_PATH + MODEL_PATH = download_weights_from_hf( + MODEL_NAME, + allow_patterns=["*"], + cache_dir=MODEL_PATH, + ignore_patterns=["tokenizer*", "vocab*", "*.safetensors"]) + args = [ + "--max-model-len", + "2048", + "--max-num-seqs", + "128", + "--enforce-eager", + "--skip-tokenizer-init", + "--load-format", + "dummy", + ] + with RemoteOpenAIServer(MODEL_PATH, args) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +async def test_token_in_token_out_and_logprobs(server): + """ + Test token-in-token-out and token_ids align with prompt_logprobs + & logprobs when return_tokens_as_token_ids is enabled. + """ + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + text = "Hello, world! How are you today?" + token_ids = tokenizer.encode(text) + async with server.get_async_client() as client: + # Test with both return_token_ids and return_tokens_as_token_ids enabled + completion = await client.completions.create( + model=MODEL_PATH, + prompt=token_ids, + max_tokens=20, + temperature=0, + echo=True, + extra_body={ + "return_token_ids": True, + }, + ) + + # Verify all fields are present + assert (completion.choices[0].token_ids is not None + and 0 < len(completion.choices[0].token_ids) <= 20) + assert completion.choices[0].prompt_token_ids is not None + + # Decode prompt tokens + if completion.choices[0].prompt_token_ids: + prompt_text = tokenizer.decode( + completion.choices[0].prompt_token_ids) + # The decoded prompt should match or close to original prompt + assert prompt_text == text diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index b81fd63ece7a4..f461d7609b945 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -127,7 +127,11 @@ class OpenAIServingCompletion(OpenAIServing): 
try: lora_request = self._maybe_get_adapters(request) - tokenizer = await self.engine_client.get_tokenizer(lora_request) + if self.model_config.skip_tokenizer_init: + tokenizer = None + else: + tokenizer = await self.engine_client.get_tokenizer(lora_request + ) request_prompts, engine_prompts = await self._preprocess_completion( request, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index a97935e109ef2..ca6f3987936da 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -526,8 +526,8 @@ class OpenAIServing: async def _normalize_prompt_text_to_input( self, request: AnyRequest, - tokenizer: AnyTokenizer, prompt: str, + tokenizer: AnyTokenizer, truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]], add_special_tokens: bool, ) -> TextTokensPrompt: @@ -563,12 +563,10 @@ class OpenAIServing: async def _normalize_prompt_tokens_to_input( self, request: AnyRequest, - tokenizer: AnyTokenizer, prompt_ids: list[int], + tokenizer: Optional[AnyTokenizer], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]], ) -> TextTokensPrompt: - async_tokenizer = self._get_async_tokenizer(tokenizer) - if truncate_prompt_tokens is None: input_ids = prompt_ids elif truncate_prompt_tokens < 0: @@ -576,7 +574,11 @@ class OpenAIServing: else: input_ids = prompt_ids[-truncate_prompt_tokens:] - input_text = await async_tokenizer.decode(input_ids) + if tokenizer is None: + input_text = "" + else: + async_tokenizer = self._get_async_tokenizer(tokenizer) + input_text = await async_tokenizer.decode(input_ids) return self._validate_input(request, input_ids, input_text) @@ -681,27 +683,27 @@ class OpenAIServing: [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs] that assumes multiple inputs. 
""" - for text in prompt_inputs: - if isinstance(text, str): + for prompt in prompt_inputs: + if isinstance(prompt, str): yield await self._normalize_prompt_text_to_input( request, - tokenizer, - prompt=text, + prompt=prompt, + tokenizer=tokenizer, truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=add_special_tokens, ) else: yield await self._normalize_prompt_tokens_to_input( request, - tokenizer, - prompt_ids=text, + prompt_ids=prompt, + tokenizer=tokenizer, truncate_prompt_tokens=truncate_prompt_tokens, ) async def _tokenize_prompt_input_or_inputs_async( self, request: AnyRequest, - tokenizer: AnyTokenizer, + tokenizer: Optional[AnyTokenizer], input_or_inputs: Optional[Union[str, list[str], list[int], list[list[int]]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, @@ -740,17 +742,19 @@ class OpenAIServing: tasks = [] for prompt_input in batch_inputs: if prompt_input["is_tokens"] is False: + assert tokenizer is not None, \ + "Tokenizer is required for text prompts" task = self._normalize_prompt_text_to_input( request, - tokenizer, prompt_input["content"], + tokenizer=tokenizer, truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=add_special_tokens) else: task = self._normalize_prompt_tokens_to_input( request, - tokenizer, prompt_input["content"], + tokenizer=tokenizer, truncate_prompt_tokens=truncate_prompt_tokens) tasks.append(task) @@ -766,7 +770,7 @@ class OpenAIServing: request: Union[DetokenizeRequest, EmbeddingCompletionRequest, RerankRequest, ClassificationRequest, ScoreRequest, TokenizeCompletionRequest], - tokenizer: AnyTokenizer, + tokenizer: Optional[AnyTokenizer], input_or_inputs: Union[str, list[str], list[int], list[list[int]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = ..., add_special_tokens: bool = ..., @@ -777,7 +781,7 @@ class OpenAIServing: async def _preprocess_completion( self, request: CompletionRequest, - tokenizer: AnyTokenizer, + tokenizer: 
Optional[AnyTokenizer], input_or_inputs: Optional[Union[str, list[str], list[int], list[list[int]]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = ..., @@ -789,7 +793,7 @@ class OpenAIServing: async def _preprocess_completion( self, request: CompletionLikeRequest, - tokenizer: AnyTokenizer, + tokenizer: Optional[AnyTokenizer], input_or_inputs: Optional[Union[str, list[str], list[int], list[list[int]]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, From b7adf94c4a6c7290dd8765819da68a801008f5a1 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 29 Aug 2025 13:28:35 -0400 Subject: [PATCH 34/56] Tuned H100/H200 triton fp8 block configs for fused_qkv_a_proj (#23939) Signed-off-by: mgoin --- benchmarks/kernels/bench_block_fp8_gemm.py | 1 + .../kernels/benchmark_w8a8_block_fp8.py | 1 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 126 ++++++++++++++- 4 files changed, 271 insertions(+), 3 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py index 883f0cf7e55f1..9663503e9baa0 100644 --- a/benchmarks/kernels/bench_block_fp8_gemm.py +++ b/benchmarks/kernels/bench_block_fp8_gemm.py @@ -16,6 +16,7 @@ assert current_platform.is_cuda(), ( # DeepSeek-V3 weight shapes DEEPSEEK_V3_SHAPES = [ (512 + 64, 7168), + (2112, 7168), ((128 + 64) * 128, 7168), (128 * (128 + 128), 512), (7168, 16384), diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index e648a91077fdb..98bde9d83c82d 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -141,6 +141,7 @@ def get_weight_shapes(tp_size): # cannot TP total = [ (512 + 64, 7168), + (2112, 
7168), ((128 + 64) * 128, 7168), (128 * (128 + 128), 512), (7168, 16384), diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..f81e09e198c86 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + 
"num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index fbca5ce05d018..e073843af64c5 100644 --- a/vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,10 +1,130 @@ { - "2048": { - "BLOCK_SIZE_M": 256, + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, - "num_warps": 8, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 
64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, "num_stages": 3 }, "3072": { From 1c26b4229673ba44eb418b7f60882daed34facc9 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Fri, 29 Aug 2025 20:47:58 +0200 Subject: [PATCH 35/56] [Docs] [V1] [Hybrid] Add new documentation re: contributing mamba-based models (#23824) Signed-off-by: Thomas Parnell --- docs/contributing/model/basic.md | 28 ++++++++++++++++++++++++++++ docs/usage/v1_guide.md | 12 +++++------- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index 21b1f21d60a35..aafdb1058e03c 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -121,3 +121,31 @@ To support a model with interleaving sliding windows, we need to take care of th - In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171). With these two steps, interleave sliding windows should work with the model. + +### How to support models that use Mamba? + +We consider 3 different scenarios: + +1. Models that use Mamba layers (either Mamba-1 or Mamba-2) but do not use attention layers. +2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers. +3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers. + +For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference. 
+The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
+For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
+Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
+V0-only classes and code will be removed in the very near future.
+The model should also be added to the `MODELS_CONFIG_MAP` dictionary to ensure that the runtime defaults are optimized.
+
+For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
+These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
+
+For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
+Please follow the same guidelines as case (2) for implementing these models.
+We use "mamba-like" to refer to layers that possess a state that is updated in-place, rather than being appended-to (like KV cache for attention).
+For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`. +It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers. +Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:v1/attention/backends/short_conv_attn.py) for examples of this. +Finally, if one wants to support torch compile and CUDA graphs, it necessary to wrap the call to the mamba-like layer inside a custom op and register it. +Please see the calls to `direct_register_custom_op` in or for examples of this. +The new custom op should then be added to the list `_attention_ops` in to ensure that piecewise CUDA graphs works as intended. diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 20234e7611333..f71805436a6ae 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -107,16 +107,14 @@ to enable simultaneous generation and embedding using the same engine instance i #### Mamba Models Models using selective state-space mechanisms instead of standard transformer attention are supported. -Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. -Please note that prefix caching is not yet supported for these models. +Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`,`FalconMambaForCausalLM`) are supported. 
-Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, +Hybrid models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). -Please note that prefix caching is not yet supported for these models. -Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`). -Please note that prefix caching is not yet supported for these models. -It is also necessary to enforce eager mode for these models in V1. +Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`, `Lfm2ForCausalLM`). + +Please note that prefix caching is not yet supported for any of the above models. #### Encoder-Decoder Models From 8c3e199998cc5b1225328f2de01a7443fbb4f3cd Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Fri, 29 Aug 2025 12:16:57 -0700 Subject: [PATCH 36/56] Revert gemma3n fast prefill changes (#23897) Signed-off-by: Yong Hoon Shin --- tests/v1/e2e/test_kv_sharing_fast_prefill.py | 1 + vllm/model_executor/models/gemma3n.py | 433 +++---------------- vllm/model_executor/models/gemma3n_mm.py | 2 +- 3 files changed, 74 insertions(+), 362 deletions(-) diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index 7bc7f44dd7ab1..6bc9b2b1d82d2 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -64,6 +64,7 @@ def cleanup(llm: LLM, compilation_config: CompilationConfig): @fork_new_process_for_each_test @pytest.mark.parametrize("enforce_eager", [True]) +@pytest.mark.skip(reason="Disable until Gemma3n supports fast prefill") def test_kv_sharing_fast_prefill( 
monkeypatch: pytest.MonkeyPatch, enforce_eager: bool, diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index 0e0e191e75fcf..ffec3408702c9 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -23,11 +23,9 @@ from torch import nn from transformers.models.gemma3n.configuration_gemma3n import Gemma3nTextConfig from vllm.attention import Attention -from vllm.compilation.backends import set_model_tag from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.forward_context import get_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.activation import (_ACTIVATION_REGISTRY, GeluAndMul, @@ -47,7 +45,6 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.v1.attention.backends.utils import KVSharingFastPrefillMetadata from .interfaces import SupportsQuant from .utils import (AutoWeightsLoader, extract_layer_index, @@ -536,178 +533,7 @@ class Gemma3nDecoderLayer(nn.Module): return corrected_predictions -# This enables torch.compile if --kv-sharing-fast-prefill passed -@support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config. 
- kv_sharing_fast_prefill) -class Gemma3nSelfDecoder(nn.Module): - """ - Includes altup embedding and self decoder layers - """ - - def __init__( - self, - *, - vllm_config: VllmConfig, - prefix: str = "", - decoder_layers: list[Gemma3nDecoderLayer], - layer_idx_start: int, - per_layer_model_projection: ColumnParallelLinear, - embed_scale_per_layer: torch.Tensor, - embed_tokens_per_layer: VocabParallelEmbedding, - per_layer_projection_norm: RMSNorm, - per_layer_input_scale: torch.Tensor, - altup_projections: nn.ModuleList, - eps: torch.Tensor, - embed_tokens: VocabParallelEmbedding, - embed_scale: torch.Tensor, - ): - super().__init__() - self.decoder_layers = decoder_layers - self.layer_idx_start = layer_idx_start - self.per_layer_model_projection = per_layer_model_projection - self.config = vllm_config.model_config.hf_config - self.embed_scale_per_layer = embed_scale_per_layer - self.embed_tokens_per_layer = embed_tokens_per_layer - self.per_layer_projection_norm = per_layer_projection_norm - self.per_layer_input_scale = per_layer_input_scale - self.altup_projections = altup_projections - self.eps = eps - self.embed_tokens = embed_tokens - self.embed_scale = embed_scale - - def get_per_layer_input_embeddings( - self, input_ids: torch.Tensor) -> torch.Tensor: - # Deal with the fact that vocab_size_per_layer_input < vocab_size - # which causes us to have some out of vocab tokens by setting - # those token ids to 0. This matches the HF implementation. 
- per_layer_inputs_mask = torch.logical_and( - input_ids >= 0, input_ids < self.config.vocab_size_per_layer_input) - per_layer_inputs_tokens = torch.where(per_layer_inputs_mask, input_ids, - torch.zeros_like(input_ids)) - return self.embed_tokens_per_layer( - per_layer_inputs_tokens) * self.embed_scale_per_layer - - def get_per_layer_inputs( - self, - hidden_states_0: torch.Tensor, - per_layer_inputs: Optional[torch.Tensor], - ) -> torch.Tensor: - per_layer_projection = self.per_layer_model_projection(hidden_states_0) - per_layer_projection = per_layer_projection.reshape( - *hidden_states_0.shape[:-1], - self.config.num_hidden_layers, - self.config.hidden_size_per_layer_input, - ) - per_layer_projection = self.per_layer_projection_norm( - per_layer_projection) - if per_layer_inputs is not None: - # Profiling run does not compute per_layer_inputs - per_layer_inputs = per_layer_projection + per_layer_inputs - per_layer_inputs *= self.per_layer_input_scale - else: - per_layer_inputs = per_layer_projection - return per_layer_inputs - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) * self.embed_scale - - def altup_embed(self, hidden_states_0: torch.Tensor) -> torch.Tensor: - # Altup embed. 
- hidden_states = [hidden_states_0] * self.config.altup_num_inputs - target_magnitude = torch.mean(hidden_states_0**2, dim=-1, - keepdim=True)**0.5 - for i in range(1, self.config.altup_num_inputs): - hidden_states[i] = self.altup_projections[i - 1](hidden_states[i]) - new_magnitude = torch.mean(hidden_states[i]**2, - dim=-1, - keepdim=True)**0.5 - hidden_states[i] *= target_magnitude / torch.maximum( - new_magnitude, self.eps) - hidden_states = torch.stack(hidden_states, dim=-1) - return hidden_states - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - inputs_embeds: Optional[torch.Tensor] = None, - per_layer_inputs: Optional[torch.Tensor] = None, - **kwargs, - ) -> tuple[torch.Tensor, torch.Tensor]: - if inputs_embeds is not None: - hidden_states_0 = inputs_embeds - else: - hidden_states_0 = self.get_input_embeddings(input_ids) - - adjusted_per_layer_inputs = self.get_per_layer_inputs( - hidden_states_0, per_layer_inputs) - hidden_states = self.altup_embed(hidden_states_0) - - # [altnum_inputs, num_tokens, hidden_size] - hidden_states = hidden_states.permute(2, 0, 1) - - for idx, layer in enumerate(self.decoder_layers): - layer_idx = idx + self.layer_idx_start - # [altup_num_inputs, num_tokens, hidden_size] - hidden_states = layer( - positions=positions, - hidden_states=hidden_states, - per_layer_input=adjusted_per_layer_inputs[:, layer_idx, :], - **kwargs, - ) - - # [num_tokens, hidden_size, altnum_inputs] - hidden_states = hidden_states.permute(1, 2, 0) - - return hidden_states, adjusted_per_layer_inputs - - -# This enables torch.compile if --kv-sharing-fast-prefill passed -@support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config. 
- kv_sharing_fast_prefill) -class Gemma3nCrossDecoder(nn.Module): - """ - Cross-decoder layers - """ - - def __init__( - self, - *, - vllm_config: VllmConfig, - prefix: str = "", - decoder_layers: list[Gemma3nDecoderLayer], - layer_idx_start: int, - ): - super().__init__() - self.decoder_layers = decoder_layers - self.layer_idx_start = layer_idx_start - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - per_layer_inputs: torch.Tensor, - **kwargs, - ) -> torch.Tensor: - # [altnum_inputs, num_tokens, hidden_size] - hidden_states = hidden_states.permute(2, 0, 1) - for idx, layer in enumerate(self.decoder_layers): - layer_idx = idx + self.layer_idx_start - # [altup_num_inputs, num_tokens, hidden_size] - hidden_states = layer( - positions=positions, - hidden_states=hidden_states, - per_layer_input=per_layer_inputs[:, layer_idx, :], - **kwargs, - ) - # [num_tokens, hidden_size, altnum_inputs] - hidden_states = hidden_states.permute(1, 2, 0) - return hidden_states - - -# This disables torch.compile if --kv-sharing-fast-prefill passed -@support_torch_compile(enable_if=lambda vllm_config: not vllm_config. 
- cache_config.kv_sharing_fast_prefill) +@support_torch_compile class Gemma3nTextModel(nn.Module, SupportsQuant): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -717,6 +543,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config + self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, @@ -786,211 +613,95 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): lambda prefix: Gemma3nDecoderLayer( config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") - - self.eps = torch.tensor(torch.finfo().min) - - first_kv_shared_layer_idx = (config.num_hidden_layers - - config.num_kv_shared_layers) - # Layer idx 0-19 are self-decoder layers in You Only Cache Once (YOCO) - with set_model_tag("self_decoder"): - self.self_decoder = Gemma3nSelfDecoder( - vllm_config=vllm_config, - prefix=f"{prefix}.self_decoder", - decoder_layers=self.layers[:first_kv_shared_layer_idx], - layer_idx_start=0, - per_layer_model_projection=self.per_layer_model_projection, - embed_scale_per_layer=self.embed_scale_per_layer, - embed_tokens_per_layer=self.embed_tokens_per_layer, - per_layer_projection_norm=self.per_layer_projection_norm, - per_layer_input_scale=self.per_layer_input_scale, - altup_projections=self.altup_projections, - eps=self.eps, - embed_tokens=self.embed_tokens, - embed_scale=self.embed_scale, - ) - # Layer idx 20-30 are cross-decoder layers in YOCO - with set_model_tag("cross_decoder"): - self.cross_decoder = Gemma3nCrossDecoder( - vllm_config=vllm_config, - prefix=f"{prefix}.cross_decoder", - decoder_layers=self.layers[first_kv_shared_layer_idx:], - layer_idx_start=first_kv_shared_layer_idx, - ) - self.norm = RMSNorm( config.hidden_size, eps=config.rms_norm_eps, ) - - self.fast_prefill_enabled = cache_config.kv_sharing_fast_prefill - - if self.fast_prefill_enabled: - # Allocate static buffers for CUDAGraph - # TODO(sarckk): 
Extract this functionality to interface - max_num_tokens = vllm_config.scheduler_config.max_num_batched_tokens - device = next(self.parameters()).device - self.positions = torch.zeros(max_num_tokens, - dtype=torch.int64, - device=device) - self.hidden_states = torch.zeros( - (max_num_tokens, config.hidden_size, - self.config.altup_num_inputs), - dtype=self.embed_tokens.weight.dtype, - device=device, - ) - self.per_layer_inputs = torch.zeros( - (max_num_tokens, self.config.num_hidden_layers, - self.config.hidden_size_per_layer_input), - dtype=self.embed_tokens.weight.dtype, - device=device, - ) + self.eps = torch.tensor(torch.finfo().min) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.self_decoder.get_input_embeddings(input_ids) + return self.embed_tokens(input_ids) * self.embed_scale - def fast_prefill_forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - inputs_embeds: Optional[torch.Tensor] = None, - per_layer_inputs: Optional[torch.Tensor] = None, - **kwargs, - ) -> torch.Tensor: - logits_indices_padded, num_logits_indices = None, None - attn_metadata = get_forward_context().attn_metadata - - # attn_metadata is None during dummy runs - if (self.fast_prefill_enabled and attn_metadata is not None): - assert isinstance(attn_metadata, dict) - # Last layer is a KV sharing layer - layer_attn_metadata = attn_metadata[ - self.layers[-1].self_attn.attn.layer_name] - if (isinstance(layer_attn_metadata, KVSharingFastPrefillMetadata)): - logits_indices_padded = ( - layer_attn_metadata.logits_indices_padded) - num_logits_indices = layer_attn_metadata.num_logits_indices - - # Copy inputs for cudagraph - batch_size = positions.size(0) - self.positions[:batch_size].copy_(positions) - self_decoder_hidden_states, per_layer_inputs_adjusted = \ - self.self_decoder( - input_ids=input_ids, - positions=self.positions[:batch_size], - inputs_embeds=inputs_embeds, - per_layer_inputs=per_layer_inputs, - **kwargs, - ) - - if 
logits_indices_padded is None: - logits_indices_padded = torch.arange( - positions.size(0), - dtype=positions.dtype, - device=positions.device, - ) - - # NOTE(sarckk): There is currently a bug caused by - # vLLM converting output of last piecewise CUDA graph - # to weakref, causing memory to be prematurely freed - # when there are multiple compilation units - # Keep .clone() until fix in - # https://github.com/vllm-project/vllm/pull/22282 - hidden_states = self_decoder_hidden_states.clone() - - # Copy inputs for cudagraph - num_padded_logits_indices = logits_indices_padded.size(0) - self.positions[:num_padded_logits_indices].copy_( - positions[logits_indices_padded]) - self.hidden_states[:num_padded_logits_indices].copy_( - self_decoder_hidden_states[logits_indices_padded]) - self.per_layer_inputs[:num_padded_logits_indices].copy_( - per_layer_inputs_adjusted[logits_indices_padded]) - cross_decoder_hidden_states = self.cross_decoder( - positions=self.positions[:num_padded_logits_indices], - hidden_states=self.hidden_states[:num_padded_logits_indices], - per_layer_inputs=self.per_layer_inputs[:num_padded_logits_indices], - **kwargs, - ) - - if num_logits_indices is not None: - assert num_logits_indices > 0 - # Merge cross-decoder and self-decoder hidden states - hidden_states[logits_indices_padded[:num_logits_indices]] = ( - cross_decoder_hidden_states[:num_logits_indices]) - else: - hidden_states = cross_decoder_hidden_states - - return hidden_states - - def normal_forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - inputs_embeds: Optional[torch.Tensor] = None, - per_layer_inputs: Optional[torch.Tensor] = None, - **kwargs, - ) -> torch.Tensor: - hidden_states, per_layer_inputs = self.self_decoder( - input_ids=input_ids, - positions=positions, - inputs_embeds=inputs_embeds, - per_layer_inputs=per_layer_inputs, - **kwargs, - ) - hidden_states = self.cross_decoder( - positions=positions, - hidden_states=hidden_states, - 
per_layer_inputs=per_layer_inputs, - **kwargs, - ) - return hidden_states - - def altup_unembed( - self, - hidden_states: torch.Tensor, - ) -> torch.Tensor: - # Altup unembed. - target_magnitude = torch.mean(hidden_states[..., 0]**2, - dim=-1, - keepdim=True)**0.5 - for i in range(1, self.config.altup_num_inputs): - hidden_states[..., i] = self.altup_unembed_projections[i - 1]( - hidden_states[..., i]) - new_magnitude = torch.mean(hidden_states[..., i]**2, - dim=-1, - keepdim=True)**0.5 - hidden_states[..., i] *= target_magnitude / torch.maximum( - new_magnitude, self.eps) - # [num_tokens,hidden_size, altup_num_inputs] -> [num_tokens,hidden_size] - hidden_states = torch.mean(hidden_states, dim=-1) - return hidden_states + def get_per_layer_input_embeddings( + self, input_ids: torch.Tensor) -> torch.Tensor: + # Deal with the fact that vocab_size_per_layer_input < vocab_size + # which causes us to have some out of vocab tokens by setting + # those token ids to 0. This matches the HF implementation. 
+ per_layer_inputs_mask = torch.logical_and( + input_ids >= 0, input_ids < self.config.vocab_size_per_layer_input) + per_layer_inputs_tokens = torch.where(per_layer_inputs_mask, input_ids, + torch.zeros_like(input_ids)) + return self.embed_tokens_per_layer( + per_layer_inputs_tokens) * self.embed_scale_per_layer def forward( self, input_ids: Optional[torch.Tensor], positions: torch.Tensor, - per_layer_inputs: Optional[torch.Tensor] = None, + per_layer_inputs: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs, ) -> Union[torch.Tensor, IntermediateTensors]: - if self.fast_prefill_enabled: - hidden_states = self.fast_prefill_forward( - input_ids, - positions, - inputs_embeds, - per_layer_inputs, - **kwargs, - ) + if inputs_embeds is not None: + hidden_states_0 = inputs_embeds else: - hidden_states = self.normal_forward( - input_ids, - positions, - inputs_embeds, - per_layer_inputs, + hidden_states_0 = self.get_input_embeddings(input_ids) + + per_layer_projection = self.per_layer_model_projection(hidden_states_0) + per_layer_projection = per_layer_projection.reshape( + *hidden_states_0.shape[:-1], + self.config.num_hidden_layers, + self.config.hidden_size_per_layer_input, + ) + per_layer_projection = self.per_layer_projection_norm( + per_layer_projection) + + if per_layer_inputs is not None: + # Profiling run does not compute per_layer_inputs + per_layer_inputs = per_layer_projection + per_layer_inputs + per_layer_inputs *= self.per_layer_input_scale + else: + per_layer_inputs = per_layer_projection + + # Altup embed. 
+ hidden_states = [hidden_states_0] * self.config.altup_num_inputs + target_magnitude = torch.mean(hidden_states_0**2, dim=-1, + keepdim=True)**0.5 + for i in range(1, self.config.altup_num_inputs): + hidden_states[i] = self.altup_projections[i - 1](hidden_states[i]) + new_magnitude = torch.mean(hidden_states[i]**2, + dim=-1, + keepdim=True)**0.5 + hidden_states[i] *= target_magnitude / torch.maximum( + new_magnitude, self.eps) + hidden_states = torch.stack(hidden_states, dim=0) + + # Transformer blocks. + for layer_idx, layer in enumerate(self.layers): + # [altup_num_inputs, num_tokens, hidden_size] + hidden_states = layer( + positions=positions, + hidden_states=hidden_states, + per_layer_input=per_layer_inputs[:, layer_idx, :], **kwargs, ) - hidden_states = self.altup_unembed(hidden_states) + + # Altup unembed. + target_magnitude = torch.mean(hidden_states[0]**2, + dim=-1, + keepdim=True)**0.5 + for i in range(1, self.config.altup_num_inputs): + hidden_states[i] = self.altup_unembed_projections[i - 1]( + hidden_states[i]) + new_magnitude = torch.mean(hidden_states[i]**2, + dim=-1, + keepdim=True)**0.5 + hidden_states[i] *= target_magnitude / torch.maximum( + new_magnitude, self.eps) + # [altup_num_inputs,num_tokens,hidden_size] -> [num_tokens,hidden_size] + hidden_states = torch.mean(hidden_states, dim=0) + return self.norm(hidden_states) def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index aba4f98ea5f33..d59dde1560aea 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -620,7 +620,7 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal): # NOTE (NickLucche) Each pass needs tokens to compute PLE so we cache # them here, as the model forward has only access to the input_embeds. 
if input_ids is not None: - per_layer_inputs = self.language_model.model.self_decoder.get_per_layer_input_embeddings( + per_layer_inputs = self.language_model.model.get_per_layer_input_embeddings( input_ids) per_layer_inputs = per_layer_inputs.reshape( -1, self.config.text_config.num_hidden_layers, From 5674a40366bae4cfc862c35b719e3dcac2587ac1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 29 Aug 2025 20:37:24 +0100 Subject: [PATCH 37/56] [Misc] Make `download_weights_from_hf` more reliable (#23863) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../model_loader/weight_utils.py | 49 ++++++++++++------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 3bb47f82d2f37..f87eeaa4563ff 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -278,33 +278,48 @@ def download_weights_from_hf( Returns: str: The path to the downloaded model weights. """ + assert len(allow_patterns) > 0 local_only = huggingface_hub.constants.HF_HUB_OFFLINE if not local_only: - # Before we download we look at that is available: - fs = HfFileSystem() - file_list = fs.ls(model_name_or_path, detail=False, revision=revision) + # Attempt to reduce allow_patterns to a single pattern + # so we only have to call snapshot_download once. + try: + fs = HfFileSystem() + file_list = fs.ls(model_name_or_path, + detail=False, + revision=revision) - # depending on what is available we download different things - for pattern in allow_patterns: - matching = fnmatch.filter(file_list, pattern) - if len(matching) > 0: - allow_patterns = [pattern] + # Use the first pattern found in the HF repo's files. 
+ for pattern in allow_patterns: + matching = fnmatch.filter(file_list, pattern) + if len(matching) > 0: + allow_patterns = [pattern] break + except Exception as e: + logger.warning( + "Failed to get file list for '%s'. Trying each pattern in " + "allow_patterns individually until weights have been " + "downloaded. Error: %s", model_name_or_path, e) logger.info("Using model weights format %s", allow_patterns) # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. with get_lock(model_name_or_path, cache_dir): start_time = time.perf_counter() - hf_folder = snapshot_download( - model_name_or_path, - allow_patterns=allow_patterns, - ignore_patterns=ignore_patterns, - cache_dir=cache_dir, - tqdm_class=DisabledTqdm, - revision=revision, - local_files_only=local_only, - ) + for allow_pattern in allow_patterns: + hf_folder = snapshot_download( + model_name_or_path, + allow_patterns=allow_pattern, + ignore_patterns=ignore_patterns, + cache_dir=cache_dir, + tqdm_class=DisabledTqdm, + revision=revision, + local_files_only=local_only, + ) + # If we have downloaded weights for this allow_pattern, + # we don't need to check the rest. 
+ if any(Path(hf_folder).glob(allow_pattern)): + break time_taken = time.perf_counter() - start_time if time_taken > 0.5: logger.info("Time spent downloading weights for %s: %.6f seconds", From d660c98c1b59580af97d6c7dd162c7c8894d40ed Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 29 Aug 2025 15:40:04 -0700 Subject: [PATCH 38/56] [CI] Fix unavailable image remote URL (#23966) Signed-off-by: Roger Wang --- tests/models/multimodal/generation/vlm_utils/custom_inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py index c53243b42e384..c68ac8f57662c 100644 --- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py +++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py @@ -119,7 +119,7 @@ def different_patch_input_cases_internvl(): def windows_attention_image_qwen2_5_vl(): # image from regression issue: https://github.com/vllm-project/vllm/issues/15122 - image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg" + image_url = "https://github.com/AOMediaCodec/av1-avif/blob/main/testFiles/Link-U/hato.jpg?raw=true" image = Image.open(BytesIO(requests.get(image_url).content)) question = "Describe the image." 
From 5b31cb1781e594aae29d878b1acde3e2f900bc41 Mon Sep 17 00:00:00 2001 From: dubejf Date: Sat, 30 Aug 2025 00:36:39 -0400 Subject: [PATCH 39/56] [Bugfix] Fix --config arg expansion called from api_server.py (#23944) Signed-off-by: Jean-Francois Dube Co-authored-by: Jean-Francois Dube Co-authored-by: Cyrus Leung --- tests/entrypoints/openai/test_cli_args.py | 22 ++++++++++++++++++++++ vllm/utils/__init__.py | 7 +++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index b20838956d721..9a1c0ea13b54f 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -27,6 +27,28 @@ def serve_parser(): return make_arg_parser(parser) +### Test config parsing +def test_config_arg_parsing(serve_parser, cli_config_file): + args = serve_parser.parse_args([]) + assert args.port == 8000 + args = serve_parser.parse_args(['--config', cli_config_file]) + assert args.port == 12312 + args = serve_parser.parse_args([ + '--config', + cli_config_file, + '--port', + '9000', + ]) + assert args.port == 9000 + args = serve_parser.parse_args([ + '--port', + '9000', + '--config', + cli_config_file, + ]) + assert args.port == 9000 + + ### Tests for LoRA module parsing def test_valid_key_value_format(serve_parser): # Test old format: name=path diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 60bddc5b500b5..c5ed10326fd50 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1976,13 +1976,16 @@ class FlexibleArgumentParser(ArgumentParser): config_args = self.load_config_file(file_path) - # 0th index is for {serve,chat,complete} + # 0th index might be the sub command {serve,chat,complete,...} # optionally followed by model_tag (only for serve) # followed by config args # followed by rest of cli args. 
# maintaining this order will enforce the precedence # of cli > config > defaults - if args[0] == "serve": + if args[0].startswith('-'): + # No sub command (e.g., api_server entry point) + args = config_args + args[0:index] + args[index + 2:] + elif args[0] == "serve": model_in_cli = len(args) > 1 and not args[1].startswith('-') model_in_config = any(arg == '--model' for arg in config_args) From 8fb85b7bb67408c725474c31fe2a8f980c250277 Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Fri, 29 Aug 2025 21:36:48 -0700 Subject: [PATCH 40/56] Add routed_scaling_factor to MoE grouped topk (#23123) Signed-off-by: Xin Yang Co-authored-by: Michael Goin Co-authored-by: Cyrus Leung --- .../layers/fused_moe/cpu_fused_moe.py | 12 ++++++++++++ .../layers/fused_moe/fused_moe.py | 7 ++++--- vllm/model_executor/layers/fused_moe/layer.py | 18 ++++++++++++++++++ .../layers/fused_moe/rocm_aiter_fused_moe.py | 3 +++ .../layers/quantization/awq_marlin.py | 2 ++ .../layers/quantization/bitsandbytes.py | 2 ++ .../compressed_tensors_moe.py | 10 ++++++++++ .../layers/quantization/experts_int8.py | 2 ++ vllm/model_executor/layers/quantization/fp8.py | 4 +++- .../model_executor/layers/quantization/gguf.py | 2 ++ .../layers/quantization/gptq_marlin.py | 2 ++ .../layers/quantization/modelopt.py | 4 ++++ .../layers/quantization/moe_wna16.py | 2 ++ .../layers/quantization/mxfp4.py | 2 ++ .../layers/quantization/quark/quark_moe.py | 4 ++++ vllm/model_executor/layers/quantization/rtn.py | 2 ++ vllm/model_executor/models/deepseek_v2.py | 1 + vllm/model_executor/models/dots1.py | 1 + vllm/model_executor/models/glm4_moe.py | 1 + 19 files changed, 77 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py index 769a04b7de89d..0eec93601b3f2 100644 --- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py 
@@ -21,6 +21,7 @@ def grouped_topk( num_expert_group: int = 0, topk_group: int = 0, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None ) -> tuple[torch.Tensor, torch.Tensor]: assert hidden_states.shape[0] == gating_output.shape[0], ( @@ -65,6 +66,8 @@ def grouped_topk( if renormalize: topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + if routed_scaling_factor != 1.0: + topk_weights = topk_weights * routed_scaling_factor return topk_weights, topk_ids.to(torch.int32) @@ -78,6 +81,7 @@ def select_experts( num_expert_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, torch.Tensor]: if use_grouped_topk: @@ -90,6 +94,7 @@ def select_experts( num_expert_group=num_expert_group, topk_group=topk_group, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias) elif custom_routing_function is None: assert scoring_func == "softmax" @@ -131,12 +136,15 @@ class IPEXFusedMOE: expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", ) -> torch.Tensor: assert activation == "silu", f"{activation} is not supported." assert not apply_router_weight_on_input + assert routed_scaling_factor == 1.0, \ + f"routed_scaling_factor {routed_scaling_factor} is not supported." 
return layer.ipex_fusion( x, use_grouped_topk, @@ -170,6 +178,7 @@ class SGLFusedMOE: expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -186,6 +195,7 @@ class SGLFusedMOE: num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, ) @@ -227,6 +237,7 @@ class CPUFusedMOE: expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -243,6 +254,7 @@ class CPUFusedMOE: num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, ) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 17a5c735a57fe..eb3e14180ecfe 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1011,7 +1011,8 @@ def grouped_topk( if renormalize: topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) - topk_weights = topk_weights * routed_scaling_factor + if routed_scaling_factor != 1.0: + topk_weights = topk_weights * routed_scaling_factor return topk_weights.to(torch.float32), topk_ids.to(torch.int32) @@ -1790,8 +1791,8 @@ def fused_moe( Defaults to False. - global_num_experts (int): The total number of experts in the global expert space. 
- - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices - from the global expert space to the local expert space of the expert + - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices + from the global expert space to the local expert space of the expert parallel shard. - w1_scale (Optional[torch.Tensor]): Optional scale to be used for w1. diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 5a87763c07211..3a2c9cbaf459e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -244,6 +244,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -400,6 +401,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -427,6 +429,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_map=expert_map, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, activation=activation, apply_router_weight_on_input=apply_router_weight_on_input, @@ -450,6 +453,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: 
Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -469,6 +473,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype, enable_eplb=enable_eplb, @@ -534,6 +539,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -560,6 +566,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_map, custom_routing_function, scoring_func, + routed_scaling_factor, e_score_correction_bias, apply_router_weight_on_input, activation, @@ -579,6 +586,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -617,6 +625,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -637,6 +646,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): raise NotImplementedError( "Expert score correction bias is not supported for TPU.") assert activation == "silu", 
f"{activation} is not supported for TPU." + assert routed_scaling_factor == 1.0, \ + f"routed_scaling_factor {routed_scaling_factor} is not supported " \ + f"for TPU." if enable_eplb is not False or expert_load_view is not None or \ logical_to_physical_map is not None or \ logical_replica_count is not None: @@ -766,6 +778,7 @@ class FusedMoE(CustomOp): prefix: str = "", custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -848,6 +861,7 @@ class FusedMoE(CustomOp): self.topk_group = topk_group self.custom_routing_function = custom_routing_function self.scoring_func = scoring_func + self.routed_scaling_factor = routed_scaling_factor self.e_score_correction_bias = e_score_correction_bias self.apply_router_weight_on_input = apply_router_weight_on_input self.activation = activation @@ -1416,6 +1430,7 @@ class FusedMoE(CustomOp): num_expert_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, indices_type: Optional[torch.dtype] = None, enable_eplb: bool = False, @@ -1460,6 +1475,7 @@ class FusedMoE(CustomOp): num_expert_group=num_expert_group, topk_group=topk_group, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias) if indices_type is not None: topk_ids = topk_ids.to(dtype=indices_type) @@ -1627,6 +1643,7 @@ class FusedMoE(CustomOp): num_expert_group=self.num_expert_group, custom_routing_function=self.custom_routing_function, scoring_func=self.scoring_func, + routed_scaling_factor=self.routed_scaling_factor, e_score_correction_bias=self.e_score_correction_bias, activation=self.activation, enable_eplb=self.enable_eplb, @@ -1695,6 +1712,7 @@ class 
FusedMoE(CustomOp): num_expert_group=self.num_expert_group, custom_routing_function=self.custom_routing_function, scoring_func=self.scoring_func, + routed_scaling_factor=self.routed_scaling_factor, e_score_correction_bias=self.e_score_correction_bias, activation=self.activation, apply_router_weight_on_input=self.apply_router_weight_on_input, diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index b838fd798bbc0..f14f13e2ade9d 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -267,6 +267,7 @@ def rocm_aiter_grouped_topk( num_expert_group: int = 0, topk_group: int = 0, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None ) -> tuple[torch.Tensor, torch.Tensor]: token = hidden_states.shape[0] @@ -298,6 +299,8 @@ def rocm_aiter_grouped_topk( scoring_func, ) + if routed_scaling_factor != 1.0: + topk_weights = topk_weights * routed_scaling_factor return topk_weights, topk_ids diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 287d66b06d6e9..8293d42ef4556 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -497,6 +497,7 @@ class AWQMoEMethod(FusedMoEMethodBase): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -523,6 +524,7 @@ class AWQMoEMethod(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, 
e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index b7897a43793c7..9713757df9b07 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -466,6 +466,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -490,6 +491,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) if self.quant_config.load_in_8bit: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 2cad9ff0d321e..e4585419226cd 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -350,6 +350,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -375,6 +376,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): num_expert_group=num_expert_group, 
custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype, ) @@ -809,6 +811,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -832,6 +835,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype, ) @@ -1057,6 +1061,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -1084,6 +1089,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) @@ -1361,6 +1367,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = 
False, activation: str = "silu", @@ -1389,6 +1396,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) @@ -1592,6 +1600,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -1618,6 +1627,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 3e43caa4cbf72..2d8a684bc7d90 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -120,6 +120,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -146,6 +147,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, 
e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 0200b0e9ed001..48bac8697e466 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -955,6 +955,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -994,7 +995,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, block_shape=self.quant_config.weight_block_size, - routed_scaling=1.0, + routed_scaling=routed_scaling_factor, ) else: assert (not renormalize @@ -1020,6 +1021,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype, enable_eplb=enable_eplb, diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 90222f2e3b0e5..ad648df238194 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -532,6 +532,7 @@ class GGUFMoEMethod(FusedMoEMethodBase): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -562,6 +563,7 @@ class GGUFMoEMethod(FusedMoEMethodBase): 
num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) return fused_moe_gguf(x, layer.w13_qweight, layer.w2_qweight, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index c5d1e017014f3..350975966668e 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -643,6 +643,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -669,6 +670,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 1fbb2e3bb6f28..4bb8438d90844 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -483,6 +483,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -521,6 +522,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): 
num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype, ) @@ -1356,6 +1358,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -1434,6 +1437,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 0cde104cc75d7..fb3e4b518bf6c 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -297,6 +297,7 @@ class MoeWNA16Method(FusedMoEMethodBase): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -322,6 +323,7 @@ class MoeWNA16Method(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 
f7d591328f93a..a2301779c77e4 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -546,6 +546,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -569,6 +570,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias) return torch.ops.vllm.fused_marlin_moe( diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 58f56c6381b31..fdf03ded04480 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -218,6 +218,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -244,6 +245,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) @@ -380,6 +382,7 @@ class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: 
float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -406,6 +409,7 @@ class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index 8bdb50e07b137..8f72b8cbea7a7 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -283,6 +283,7 @@ class RTNMoEMethod(FusedMoEMethodBase): expert_map: Optional[torch.Tensor] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", @@ -309,6 +310,7 @@ class RTNMoEMethod(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index ed033954f7c08..61e8090411f27 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -160,6 +160,7 @@ class DeepseekV2MoE(nn.Module): topk_group=config.topk_group, prefix=f"{prefix}.experts", scoring_func=config.scoring_func, + routed_scaling_factor=self.routed_scaling_factor, e_score_correction_bias=self.gate.e_score_correction_bias, enable_eplb=self.enable_eplb, num_redundant_experts=self.n_redundant_experts) diff --git a/vllm/model_executor/models/dots1.py 
b/vllm/model_executor/models/dots1.py index c386f8db9eec6..a5477af8694b4 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -137,6 +137,7 @@ class Dots1MoE(nn.Module): topk_group=config.topk_group, prefix=f"{prefix}.experts", scoring_func=config.scoring_func, + routed_scaling_factor=self.routed_scaling_factor, e_score_correction_bias=self.gate.e_score_correction_bias) if config.n_shared_experts is not None: diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index fcc63815ac56f..06ed453ec29f9 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -159,6 +159,7 @@ class Glm4MoE(nn.Module): topk_group=config.topk_group, prefix=f"{prefix}.experts", scoring_func="sigmoid", + routed_scaling_factor=self.routed_scaling_factor, e_score_correction_bias=self.gate.e_score_correction_bias, enable_eplb=self.enable_eplb, num_redundant_experts=self.n_redundant_experts) From ee52a32705988bcac1833feab34af977addf5cca Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 29 Aug 2025 21:41:25 -0700 Subject: [PATCH 41/56] [CI] Move testing image from remote URL to S3 (#23980) Signed-off-by: Roger Wang --- .../multimodal/generation/vlm_utils/custom_inputs.py | 11 ++++------- vllm/assets/image.py | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py index c68ac8f57662c..e369416fc49cc 100644 --- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py +++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py @@ -1,12 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Custom input builders for edge-cases in different models.""" -from io import BytesIO from typing import Callable -import requests -from PIL import Image - +from vllm.assets.image 
import ImageAsset from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import (rescale_video_size, resize_video, sample_frames_from_video) @@ -118,9 +115,9 @@ def different_patch_input_cases_internvl(): def windows_attention_image_qwen2_5_vl(): - # image from regression issue: https://github.com/vllm-project/vllm/issues/15122 - image_url = "https://github.com/AOMediaCodec/av1-avif/blob/main/testFiles/Link-U/hato.jpg?raw=true" - image = Image.open(BytesIO(requests.get(image_url).content)) + + # image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501 + image = ImageAsset("hato").pil_image question = "Describe the image." img_prompt = "<|vision_start|><|image_pad|><|vision_end|>" diff --git a/vllm/assets/image.py b/vllm/assets/image.py index c977242a3d484..c8f8d43a98355 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -11,7 +11,7 @@ from .base import get_vllm_public_assets VLM_IMAGES_DIR = "vision_model_images" -ImageAssetName = Literal["stop_sign", "cherry_blossom"] +ImageAssetName = Literal["stop_sign", "cherry_blossom", "hato"] @dataclass(frozen=True) From 9748c5198b492e22dc24d6eb455ec907369392f3 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Sat, 30 Aug 2025 00:14:43 -0700 Subject: [PATCH 42/56] [CI] Fix broken compile tests due to unsupported SiluMul+Nvfp4Quant fusion (#23973) Signed-off-by: Yong Hoon Shin Co-authored-by: Roger Wang --- vllm/compilation/activation_quant_fusion.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/compilation/activation_quant_fusion.py b/vllm/compilation/activation_quant_fusion.py index 40e124a03eb08..f2fbb1200eecc 100644 --- a/vllm/compilation/activation_quant_fusion.py +++ b/vllm/compilation/activation_quant_fusion.py @@ -29,8 +29,9 @@ SILU_MUL_OP = torch.ops._C.silu_and_mul.default FUSED_OPS: dict[QuantKey, OpOverload] = { kFp8StaticTensorSym: 
torch.ops._C.silu_and_mul_quant.default, # noqa: E501 } -if current_platform.is_cuda() and hasattr(torch.ops._C, - "silu_and_mul_nvfp4_quant"): +silu_and_mul_nvfp4_quant_supported = (current_platform.is_cuda() and hasattr( + torch.ops._C, "silu_and_mul_nvfp4_quant")) +if silu_and_mul_nvfp4_quant_supported: FUSED_OPS[ kNvfp4Quant] = torch.ops._C.silu_and_mul_nvfp4_quant.default # noqa: E501 @@ -171,8 +172,9 @@ class ActivationQuantFusionPass(VllmInductorPass): pattern_silu_mul_fp8 = SiluMulFp8StaticQuantPattern() pattern_silu_mul_fp8.register(self.patterns) - pattern_silu_mul_nvfp4 = SiluMulNvfp4QuantPattern() - pattern_silu_mul_nvfp4.register(self.patterns) + if silu_and_mul_nvfp4_quant_supported: + pattern_silu_mul_nvfp4 = SiluMulNvfp4QuantPattern() + pattern_silu_mul_nvfp4.register(self.patterns) def __call__(self, graph: torch.fx.Graph): self.begin() From f1bddbd852f37f98958d636821c45014c05e07a8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 30 Aug 2025 15:14:58 +0800 Subject: [PATCH 43/56] [Core] Cleanup TPU model runner for MM (#23894) Signed-off-by: DarkLight1337 --- vllm/v1/worker/tpu_model_runner.py | 32 +----------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 2307006127085..985d5ba58c49c 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -808,31 +808,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return per_layer_attn_metadata, logits_indices, padded_num_reqs,\ num_reqs, end_index - def _scatter_placeholders( - self, - embeds: torch.Tensor, - is_embed: Optional[torch.Tensor], - ) -> torch.Tensor: - if is_embed is None: - return embeds - - placeholders = embeds.new_full( - (is_embed.shape[0], embeds.shape[-1]), - fill_value=torch.nan, - ) - placeholders[is_embed] = embeds - return placeholders - - def _gather_placeholders( - self, - placeholders: torch.Tensor, - 
is_embed: Optional[torch.Tensor], - ) -> torch.Tensor: - if is_embed is None: - return placeholders - - return placeholders[is_embed] - def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs if not scheduled_encoder_inputs: @@ -892,12 +867,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # NOTE (NickLucche) here we diverge from logic in other runners, as we # assume to only have whole mm items to process. Hence we avoid the # intrinsic dynamism that `scatter_mm_placeholders` introduces. - for (mm_hash, pos_info), output in zip( - mm_hashes_pos, - encoder_outputs, - ): - if req_id not in self.encoder_cache: - self.encoder_cache[req_id] = {} + for (mm_hash, pos_info), output in zip(mm_hashes_pos, encoder_outputs): assert pos_info.is_embed is None, "Expected all positions to be"\ " contiguous and embeddings." self.encoder_cache[mm_hash] = output From 4071c76cf3cff46e14630f1e66cbf006b6eb51d3 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sat, 30 Aug 2025 09:16:15 +0200 Subject: [PATCH 44/56] [V1] [Hybrid] Move MiniMaxLinearAttention into layers/mamba (#23831) Signed-off-by: Thomas Parnell Co-authored-by: Cyrus Leung --- .../layers/mamba/linear_attn.py | 442 ++++++++++++++++++ vllm/model_executor/models/minimax_text_01.py | 416 +---------------- 2 files changed, 448 insertions(+), 410 deletions(-) create mode 100644 vllm/model_executor/layers/mamba/linear_attn.py diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py new file mode 100644 index 0000000000000..d93cef1a27ad4 --- /dev/null +++ b/vllm/model_executor/layers/mamba/linear_attn.py @@ -0,0 +1,442 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math +from typing import TYPE_CHECKING, Optional, Union + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import 
AttentionBackend + +from typing import TYPE_CHECKING + +import torch +import torch.distributed +import torch.nn.functional as F +from einops import rearrange +from torch import nn + +from vllm import envs +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config +from vllm.distributed.communication_op import tensor_model_parallel_all_reduce +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.forward_context import ForwardContext, get_forward_context +from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.lightning_attn import ( + lightning_attention, linear_decode_forward_triton) +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.mamba.abstract import MambaBase +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateDtypeCalculator, MambaStateShapeCalculator) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op +from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + +import torch +import torch.distributed + +from vllm.model_executor.models.minimax_cache import MinimaxCacheParams + + +class MiniMaxText01RMSNormTP(CustomOp): + name = "MiniMaxText01RMSNormTP" + + def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: + super().__init__() + self.tp_world = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + self.weight = nn.Parameter(torch.ones(int(hidden_size / + self.tp_world))) + + self.weight.weight_loader = self.weight_loader + self.variance_epsilon = eps + return + + @staticmethod + def weight_loader( + param: nn.Parameter, + 
loaded_weight: torch.Tensor, + ) -> None: + tp_world = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() + + shard_size = loaded_weight.shape[0] // tp_world + shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) + param.data.copy_(loaded_weight[shard]) + return + + def _forward( + self, + x: torch.Tensor, + ) -> torch.Tensor: + orig_dtype = x.dtype + x = x.to(torch.float32) + variance = x.pow(2).mean(dim=-1, keepdim=True, dtype=torch.float32) + if self.tp_world > 1: + variance = tensor_model_parallel_all_reduce( + variance) / self.tp_world + x = x * torch.rsqrt(variance + self.variance_epsilon) + + weight = self.weight + if x.size(-1) != self.weight.size(0): + if self.weight.size(0) < x.size(-1): + repeat_count = (x.size(-1) + self.weight.size(0)) // x.size(-1) + full_weight = self.weight.repeat(repeat_count) + weight = full_weight[:x.size(-1)] + else: + weight = self.weight[:x.size(-1)] + + x = x.to(orig_dtype) * weight + return x + + def forward( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + assert residual is None, "RMSNorm does not support residual connection." 
+ return self._forward(x) + + +class MiniMaxText01LinearKernel: + + @staticmethod + def jit_linear_forward_prefix(q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + kv_caches: torch.Tensor, + slope_rate: torch.Tensor, + block_size: int, + layer_idx: Optional[int] = None, + **kwargs) -> torch.Tensor: + + slope_rate = slope_rate.to(torch.float32) + should_pad_dim = q.dim() == 3 + if should_pad_dim: + q = q.unsqueeze(0) + k = k.unsqueeze(0) + v = v.unsqueeze(0) + b, h, n, d = q.shape + e = d + kv_history = kv_caches.reshape(1, h, d, e).contiguous() + output, kv_history = lightning_attention(q, + k, + v, + slope_rate, + block_size=block_size, + kv_history=kv_history) + kv_caches.copy_(kv_history[:, :, -1, :, :].reshape(h, d, e)) + assert output.shape[0] == 1, "batch size must be 1" + return rearrange(output.squeeze(0), "h n d -> n (h d)") + + +class MiniMaxText01LinearAttention(nn.Module, MambaBase): + + @property + def mamba_type(self) -> str: + return "linear_attention" + + def get_attn_backend(self) -> type["AttentionBackend"]: + from vllm.v1.attention.backends.linear_attn import ( + LinearAttentionBackend) + return LinearAttentionBackend + + def get_state_dtype(self) -> tuple[torch.dtype]: + assert self.model_config is not None + assert self.cache_config is not None + return MambaStateDtypeCalculator.linear_attention_state_dtype( + self.model_config.dtype, + self.cache_config.mamba_cache_dtype, + ) + + def get_state_shape(self) -> tuple[tuple[int, int, int], ...]: + return MambaStateShapeCalculator.linear_attention_state_shape( + num_heads=self.num_heads, + tp_size=self.tp_size, + head_dim=self.head_dim) + + def __init__( + self, + hidden_size: int, + hidden_inner_size: int, + num_heads: int, + head_dim: int, + max_position: int, + block_size: int, + num_hidden_layer: int, + model_config: Optional[ModelConfig] = None, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + layer_idx: int = 0, + linear_layer_idx: 
int = 0, + prefix: str = "linear_attn", + ) -> None: + super().__init__() + + self.layer_idx = layer_idx + self.BLOCK = block_size + self.hidden_size = hidden_size + self.num_heads = num_heads + self.head_dim = head_dim + self.total_num_heads = num_heads + self.hidden_inner_size = hidden_inner_size + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + + assert self.total_num_heads % self.tp_size == 0 + self.tp_heads = self.total_num_heads // self.tp_size + self.qkv_size = self.num_heads * self.head_dim + self.tp_hidden = self.head_dim * self.tp_heads + self.model_config = model_config + self.cache_config = cache_config + self.prefix = prefix + + self.qkv_proj = ColumnParallelLinear( + hidden_size, + self.hidden_inner_size * 3, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.output_gate = ColumnParallelLinear( + hidden_size, + self.hidden_inner_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.output_gate", + ) + self.out_proj = RowParallelLinear( + self.hidden_inner_size, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.out_proj", + ) + self.norm = MiniMaxText01RMSNormTP( + self.hidden_inner_size, + eps=1e-5, + ) + + slope_rate = MiniMaxText01LinearAttention._build_slope_tensor( + self.num_heads) + if num_hidden_layer <= 1: + self.slope_rate = slope_rate * (1 + 1e-5) + else: + self.slope_rate = slope_rate * (1 - layer_idx / + (num_hidden_layer - 1) + 1e-5) + self.tp_slope = self.slope_rate[self.tp_rank * + self.tp_heads:(self.tp_rank + 1) * + self.tp_heads].contiguous() + + if envs.VLLM_USE_V1: + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + + @staticmethod + def weight_direct_load(param: torch.Tensor, + loaded_weight: torch.Tensor) -> 
None: + assert param.size() == loaded_weight.size() + param.data.copy_(loaded_weight) + return + + @staticmethod + def _build_slope_tensor(n_attention_heads: int): + + def get_slopes(n): + + def get_slopes_power_of_2(n): + start = 2**(-(2**-(math.log2(n) - 3))) + ratio = start + return [start * ratio**i for i in range(n)] + + if math.log2(n).is_integer(): + return get_slopes_power_of_2(n) + else: + closest_power_of_2 = 2**math.floor(math.log2(n)) + return (get_slopes_power_of_2(closest_power_of_2) + get_slopes( + 2 * closest_power_of_2)[0::2][:n - closest_power_of_2]) + + slopes = torch.tensor(get_slopes(n_attention_heads), + dtype=torch.float32).reshape( + n_attention_heads, 1, 1) + return slopes + + def _prefill_and_mix_infer(self, q, k, v, kv_cache, state_indices_tensor, + attn_metadata): + hidden = [] + for _prefill_idx in range(getattr(attn_metadata, "num_prefills", 0)): + if _prefill_idx >= len(attn_metadata.query_start_loc): + break + if _prefill_idx >= len(state_indices_tensor): + break + # prefills are packed at end of batch in V1 + offset = attn_metadata.num_decode_tokens if envs.VLLM_USE_V1 else 0 + _start = attn_metadata.query_start_loc[offset + _prefill_idx] + _end = attn_metadata.query_start_loc[offset + _prefill_idx + 1] + slot_id = state_indices_tensor[offset + _prefill_idx] + qs = q[_start:_end].transpose(0, 1).contiguous() + ks = k[_start:_end].transpose(0, 1).contiguous() + vs = v[_start:_end].transpose(0, 1).contiguous() + slice_layer_cache = kv_cache[slot_id, ...] 
+ + out_slice = MiniMaxText01LinearKernel.jit_linear_forward_prefix( + qs, + ks, + vs, + slice_layer_cache, + self.tp_slope, + self.BLOCK, + layer_idx=self.layer_idx) + hidden.append(out_slice.contiguous()) + if attn_metadata.num_decode_tokens > 0: + hidden_decode = self._decode_infer(q, k, v, kv_cache, + state_indices_tensor, + attn_metadata) + if envs.VLLM_USE_V1: + hidden.insert(0, hidden_decode) + else: + hidden.append(hidden_decode) + + if not hidden: + return torch.empty((0, q.size(-1)), device=q.device, dtype=q.dtype) + + hidden = torch.concat(hidden, dim=0).contiguous() + return hidden + + def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor, + attn_metadata): + if not envs.VLLM_USE_V1: + q = q[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() + k = k[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() + v = v[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() + num_prefills = getattr(attn_metadata, "num_prefills", 0) + slot_id = state_indices_tensor[num_prefills:] + else: + q = q[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() + k = k[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() + v = v[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() + slot_id = state_indices_tensor[:attn_metadata.num_decodes] + hidden = linear_decode_forward_triton(q, k, v, kv_cache, self.tp_slope, + slot_id, 32) + return hidden + + def forward(self, hidden_states: torch.Tensor, output: torch.Tensor, + positions: torch.Tensor, + kv_caches: MinimaxCacheParams) -> None: + if not envs.VLLM_USE_V1: + self._forward(hidden_states, output, positions, kv_caches) + else: + torch.ops.vllm.linear_attention( + hidden_states, + output, + positions, + self.prefix, + ) + + def _forward(self, hidden_states: torch.Tensor, output: torch.Tensor, + positions: torch.Tensor, + kv_caches: Optional[MinimaxCacheParams]) -> None: + forward_context = get_forward_context() + attn_metadata: AttentionMetadata = 
forward_context.attn_metadata + if envs.VLLM_USE_V1 and attn_metadata is not None: + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + assert isinstance(attn_metadata, LinearAttentionMetadata) + num_actual_tokens = attn_metadata.num_prefill_tokens + \ + attn_metadata.num_decode_tokens + else: + num_actual_tokens = hidden_states.shape[0] + + qkv, _ = self.qkv_proj(hidden_states[:num_actual_tokens]) + qkv32 = qkv.to(torch.float32) + qkvact = torch.nn.functional.silu(qkv32) + qkvact = qkvact.view((qkv.shape[0], self.tp_heads, -1)) + q, k, v = torch.split(qkvact, [self.head_dim] * 3, dim=-1) + if envs.VLLM_USE_V1: + if attn_metadata is not None: + kv_cache = self.kv_cache[forward_context.virtual_engine][0] + state_indices_tensor = attn_metadata.state_indices_tensor + + num_prefills = getattr(attn_metadata, "num_prefills", 0) + if num_prefills > 0: + num_decode_tokens = getattr(attn_metadata, + "num_decode_tokens", 0) + for prefill_idx in range(num_prefills): + q_start = attn_metadata.query_start_loc[ + num_decode_tokens + prefill_idx] + q_end = attn_metadata.query_start_loc[num_decode_tokens + + prefill_idx + + 1] + query_len = q_end - q_start + context_len = attn_metadata.seq_lens[ + num_decode_tokens + prefill_idx] - query_len + if context_len == 0: + block_to_clear = state_indices_tensor[ + num_decode_tokens + prefill_idx] + kv_cache[block_to_clear, ...] 
= 0 + else: + assert kv_caches is not None + kv_cache = kv_caches.minimax_cache + state_indices_tensor = kv_caches.state_indices_tensor + + decode_only = getattr(attn_metadata, "num_prefills", 0) == 0 + if attn_metadata is None: + hidden = torch.empty((q.shape[0], q.shape[1] * q.shape[2]), + device=q.device, + dtype=q.dtype) + else: + if not decode_only: + hidden = self._prefill_and_mix_infer(q, k, v, kv_cache, + state_indices_tensor, + attn_metadata) + else: + hidden = self._decode_infer(q, k, v, kv_cache, + state_indices_tensor, + attn_metadata) + hidden = self.norm._forward(hidden) + gate, _ = self.output_gate(hidden_states[:num_actual_tokens]) + hidden = F.sigmoid(gate) * hidden + hidden = hidden.to(hidden_states.dtype) + + output[:num_actual_tokens], _ = self.out_proj(hidden) + + +def linear_attention( + hidden_states: torch.Tensor, + output: torch.Tensor, + positions: torch.Tensor, + layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + self._forward(hidden_states=hidden_states, + output=output, + positions=positions, + kv_caches=None) + + +def linear_attention_fake( + hidden_states: torch.Tensor, + output: torch.Tensor, + positions: torch.Tensor, + layer_name: str, +) -> None: + return + + +direct_register_custom_op( + op_name="linear_attention", + op_func=linear_attention, + mutates_args=["output"], + fake_impl=linear_attention_fake, + dispatch_key=current_platform.dispatch_key, +) diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 93ef13d5d16a0..ef1fe86c5b5c0 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -1,45 +1,37 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only MiniMaxText01 model.""" -import math from collections.abc import Iterable from itertools import 
islice from typing import TYPE_CHECKING, Optional, Union if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend + pass import regex as re import torch import torch.distributed -import torch.nn.functional as F -from einops import rearrange from torch import nn from transformers import MiniMaxConfig from vllm import envs from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import (CacheConfig, ModelConfig, VllmConfig, - get_current_vllm_config) -from vllm.distributed.communication_op import tensor_model_parallel_all_reduce +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed.parallel_state import ( get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.forward_context import ForwardContext, get_forward_context -from vllm.model_executor.custom_op import CustomOp +from vllm.forward_context import get_forward_context from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.lightning_attn import ( - lightning_attention, linear_decode_forward_triton) -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.abstract import MambaBase +from vllm.model_executor.layers.mamba.linear_attn import ( + MiniMaxText01LinearAttention) from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.quantization.base_config import ( @@ -50,10 +42,7 @@ from 
vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import maybe_prefix from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import direct_register_custom_op -from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata from .interfaces import HasInnerState, IsHybrid from .minimax_cache import MinimaxCacheManager, MinimaxCacheParams @@ -87,66 +76,6 @@ def weight_loader_with_alias(alias: str): return wrapper -class MiniMaxText01RMSNormTP(CustomOp): - name = "MiniMaxText01RMSNormTP" - - def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: - super().__init__() - self.tp_world = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() - self.weight = nn.Parameter(torch.ones(int(hidden_size / - self.tp_world))) - - self.weight.weight_loader = self.weight_loader - self.variance_epsilon = eps - return - - @staticmethod - def weight_loader( - param: nn.Parameter, - loaded_weight: torch.Tensor, - ) -> None: - tp_world = get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() - - shard_size = loaded_weight.shape[0] // tp_world - shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) - param.data.copy_(loaded_weight[shard]) - return - - def _forward( - self, - x: torch.Tensor, - ) -> torch.Tensor: - orig_dtype = x.dtype - x = x.to(torch.float32) - variance = x.pow(2).mean(dim=-1, keepdim=True, dtype=torch.float32) - if self.tp_world > 1: - variance = tensor_model_parallel_all_reduce( - variance) / self.tp_world - x = x * torch.rsqrt(variance + self.variance_epsilon) - - weight = self.weight - if x.size(-1) != self.weight.size(0): - if self.weight.size(0) < x.size(-1): - repeat_count = (x.size(-1) + self.weight.size(0)) // x.size(-1) - 
full_weight = self.weight.repeat(repeat_count) - weight = full_weight[:x.size(-1)] - else: - weight = self.weight[:x.size(-1)] - - x = x.to(orig_dtype) * weight - return x - - def forward( - self, - x: torch.Tensor, - residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: - assert residual is None, "RMSNorm does not support residual connection." - return self._forward(x) - - class MiniMaxText01MLP(nn.Module): def __init__( @@ -253,307 +182,6 @@ class MiniMaxText01MoE(nn.Module): return final_hidden -class MiniMaxText01LinearKernel: - - @staticmethod - def jit_linear_forward_prefix(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - kv_caches: torch.Tensor, - slope_rate: torch.Tensor, - block_size: int, - layer_idx: int = None, - **kwargs) -> torch.Tensor: - - slope_rate = slope_rate.to(torch.float32) - should_pad_dim = q.dim() == 3 - if should_pad_dim: - q = q.unsqueeze(0) - k = k.unsqueeze(0) - v = v.unsqueeze(0) - b, h, n, d = q.shape - e = d - kv_history = kv_caches.reshape(1, h, d, e).contiguous() - output, kv_history = lightning_attention(q, - k, - v, - slope_rate, - block_size=block_size, - kv_history=kv_history) - kv_caches.copy_(kv_history[:, :, -1, :, :].reshape(h, d, e)) - assert output.shape[0] == 1, "batch size must be 1" - return rearrange(output.squeeze(0), "h n d -> n (h d)") - - -class MiniMaxText01LinearAttention(nn.Module, MambaBase): - - @property - def mamba_type(self) -> str: - return "linear_attention" - - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.linear_attn import ( - LinearAttentionBackend) - return LinearAttentionBackend - - def get_state_dtype(self) -> tuple[torch.dtype]: - return MambaStateDtypeCalculator.linear_attention_state_dtype( - self.model_config.dtype, - self.cache_config.mamba_cache_dtype, - ) - - def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: - return 
MambaStateShapeCalculator.linear_attention_state_shape( - num_heads=self.num_heads, - tp_size=self.tp_size, - head_dim=self.head_dim) - - def __init__( - self, - hidden_size: int, - hidden_inner_size: int, - num_heads: int, - head_dim: int, - max_position: int, - block_size: int, - num_hidden_layer: int, - model_config: Optional[ModelConfig] = None, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - layer_idx: int = 0, - linear_layer_idx: int = 0, - prefix: str = "linear_attn", - ) -> None: - super().__init__() - - self.layer_idx = layer_idx - self.BLOCK = block_size - self.hidden_size = hidden_size - self.num_heads = num_heads - self.head_dim = head_dim - self.total_num_heads = num_heads - self.hidden_inner_size = hidden_inner_size - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() - - assert self.total_num_heads % self.tp_size == 0 - self.tp_heads = self.total_num_heads // self.tp_size - self.qkv_size = self.num_heads * self.head_dim - self.tp_hidden = self.head_dim * self.tp_heads - self.model_config = model_config - self.cache_config = cache_config - self.prefix = prefix - - self.qkv_proj = ColumnParallelLinear( - hidden_size, - self.hidden_inner_size * 3, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) - self.output_gate = ColumnParallelLinear( - hidden_size, - self.hidden_inner_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.output_gate", - ) - self.out_proj = RowParallelLinear( - self.hidden_inner_size, - hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.out_proj", - ) - self.norm = MiniMaxText01RMSNormTP( - self.hidden_inner_size, - eps=1e-5, - ) - - slope_rate = MiniMaxText01LinearAttention._build_slope_tensor( - self.num_heads) - if num_hidden_layer <= 1: - self.slope_rate = slope_rate * (1 + 1e-5) - else: - self.slope_rate = slope_rate * (1 - layer_idx / - (num_hidden_layer - 
1) + 1e-5) - self.tp_slope = self.slope_rate[self.tp_rank * - self.tp_heads:(self.tp_rank + 1) * - self.tp_heads].contiguous() - - if envs.VLLM_USE_V1: - compilation_config = get_current_vllm_config().compilation_config - if prefix in compilation_config.static_forward_context: - raise ValueError(f"Duplicate layer name: {prefix}") - compilation_config.static_forward_context[prefix] = self - - @staticmethod - def weight_direct_load(param: torch.Tensor, - loaded_weight: torch.Tensor) -> None: - assert param.size() == loaded_weight.size() - param.data.copy_(loaded_weight) - return - - @staticmethod - def _build_slope_tensor(n_attention_heads: int): - - def get_slopes(n): - - def get_slopes_power_of_2(n): - start = 2**(-(2**-(math.log2(n) - 3))) - ratio = start - return [start * ratio**i for i in range(n)] - - if math.log2(n).is_integer(): - return get_slopes_power_of_2(n) - else: - closest_power_of_2 = 2**math.floor(math.log2(n)) - return (get_slopes_power_of_2(closest_power_of_2) + get_slopes( - 2 * closest_power_of_2)[0::2][:n - closest_power_of_2]) - - slopes = torch.tensor(get_slopes(n_attention_heads), - dtype=torch.float32).reshape( - n_attention_heads, 1, 1) - return slopes - - def _prefill_and_mix_infer(self, q, k, v, kv_cache, state_indices_tensor, - attn_metadata): - hidden = [] - for _prefill_idx in range(getattr(attn_metadata, "num_prefills", 0)): - if _prefill_idx >= len(attn_metadata.query_start_loc): - break - if _prefill_idx >= len(state_indices_tensor): - break - # prefills are packed at end of batch in V1 - offset = attn_metadata.num_decode_tokens if envs.VLLM_USE_V1 else 0 - _start = attn_metadata.query_start_loc[offset + _prefill_idx] - _end = attn_metadata.query_start_loc[offset + _prefill_idx + 1] - slot_id = state_indices_tensor[offset + _prefill_idx] - qs = q[_start:_end].transpose(0, 1).contiguous() - ks = k[_start:_end].transpose(0, 1).contiguous() - vs = v[_start:_end].transpose(0, 1).contiguous() - slice_layer_cache = kv_cache[slot_id, ...] 
- - out_slice = MiniMaxText01LinearKernel.jit_linear_forward_prefix( - qs, - ks, - vs, - slice_layer_cache, - self.tp_slope, - self.BLOCK, - layer_idx=self.layer_idx) - hidden.append(out_slice.contiguous()) - if attn_metadata.num_decode_tokens > 0: - hidden_decode = self._decode_infer(q, k, v, kv_cache, - state_indices_tensor, - attn_metadata) - if envs.VLLM_USE_V1: - hidden.insert(0, hidden_decode) - else: - hidden.append(hidden_decode) - - if not hidden: - return torch.empty((0, q.size(-1)), device=q.device, dtype=q.dtype) - - hidden = torch.concat(hidden, dim=0).contiguous() - return hidden - - def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor, - attn_metadata): - if not envs.VLLM_USE_V1: - q = q[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() - k = k[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() - v = v[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() - num_prefills = getattr(attn_metadata, "num_prefills", 0) - slot_id = state_indices_tensor[num_prefills:] - else: - q = q[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() - k = k[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() - v = v[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() - slot_id = state_indices_tensor[:attn_metadata.num_decodes] - hidden = linear_decode_forward_triton(q, k, v, kv_cache, self.tp_slope, - slot_id, 32) - return hidden - - def forward(self, hidden_states: torch.Tensor, output: torch.Tensor, - positions: torch.Tensor, - kv_caches: MinimaxCacheParams) -> None: - if not envs.VLLM_USE_V1: - self._forward(hidden_states, output, positions, kv_caches) - else: - torch.ops.vllm.linear_attention( - hidden_states, - output, - positions, - self.prefix, - ) - - def _forward(self, hidden_states: torch.Tensor, output: torch.Tensor, - positions: torch.Tensor, - kv_caches: Optional[MinimaxCacheParams]) -> None: - forward_context = get_forward_context() - attn_metadata: AttentionMetadata = 
forward_context.attn_metadata - if envs.VLLM_USE_V1 and attn_metadata is not None: - assert isinstance(attn_metadata, dict) - attn_metadata = attn_metadata[self.prefix] - assert isinstance(attn_metadata, LinearAttentionMetadata) - num_actual_tokens = attn_metadata.num_prefill_tokens + \ - attn_metadata.num_decode_tokens - else: - num_actual_tokens = hidden_states.shape[0] - - qkv, _ = self.qkv_proj(hidden_states[:num_actual_tokens]) - qkv32 = qkv.to(torch.float32) - qkvact = torch.nn.functional.silu(qkv32) - qkvact = qkvact.view((qkv.shape[0], self.tp_heads, -1)) - q, k, v = torch.split(qkvact, [self.head_dim] * 3, dim=-1) - if envs.VLLM_USE_V1: - if attn_metadata is not None: - kv_cache = self.kv_cache[forward_context.virtual_engine][0] - state_indices_tensor = attn_metadata.state_indices_tensor - - num_prefills = getattr(attn_metadata, "num_prefills", 0) - if num_prefills > 0: - num_decode_tokens = getattr(attn_metadata, - "num_decode_tokens", 0) - for prefill_idx in range(num_prefills): - q_start = attn_metadata.query_start_loc[ - num_decode_tokens + prefill_idx] - q_end = attn_metadata.query_start_loc[num_decode_tokens - + prefill_idx + - 1] - query_len = q_end - q_start - context_len = attn_metadata.seq_lens[ - num_decode_tokens + prefill_idx] - query_len - if context_len == 0: - block_to_clear = state_indices_tensor[ - num_decode_tokens + prefill_idx] - kv_cache[block_to_clear, ...] 
= 0 - else: - kv_cache = kv_caches.minimax_cache - state_indices_tensor = kv_caches.state_indices_tensor - - decode_only = getattr(attn_metadata, "num_prefills", 0) == 0 - if attn_metadata is None: - hidden = torch.empty((q.shape[0], q.shape[1] * q.shape[2]), - device=q.device, - dtype=q.dtype) - else: - if not decode_only: - hidden = self._prefill_and_mix_infer(q, k, v, kv_cache, - state_indices_tensor, - attn_metadata) - else: - hidden = self._decode_infer(q, k, v, kv_cache, - state_indices_tensor, - attn_metadata) - hidden = self.norm._forward(hidden) - gate, _ = self.output_gate(hidden_states[:num_actual_tokens]) - hidden = F.sigmoid(gate) * hidden - hidden = hidden.to(hidden_states.dtype) - output[:num_actual_tokens], _ = self.out_proj(hidden) - - class MiniMaxText01Attention(nn.Module): def __init__( @@ -1397,35 +1025,3 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid): tp_size=parallel_config.tensor_parallel_size, head_dim=hf_config.head_dim, ) - - -def linear_attention( - hidden_states: torch.Tensor, - output: torch.Tensor, - positions: torch.Tensor, - layer_name: str, -) -> None: - forward_context: ForwardContext = get_forward_context() - self = forward_context.no_compile_layers[layer_name] - self._forward(hidden_states=hidden_states, - output=output, - positions=positions, - kv_caches=None) - - -def linear_attention_fake( - hidden_states: torch.Tensor, - output: torch.Tensor, - positions: torch.Tensor, - layer_name: str, -) -> None: - return - - -direct_register_custom_op( - op_name="linear_attention", - op_func=linear_attention, - mutates_args=["output"], - fake_impl=linear_attention_fake, - dispatch_key=current_platform.dispatch_key, -) From 628d00cd7b06c9706b0613aafaefe927fb255877 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 30 Aug 2025 19:16:11 +0800 Subject: [PATCH 45/56] [Bugfix] Fix test_lora_resolvers.py (#23984) Signed-off-by: Jee Jee Li --- tests/entrypoints/openai/test_lora_resolvers.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index f4801172580c6..818efd825640c 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -47,6 +47,7 @@ class MockModelConfig: allowed_local_media_path: str = "" encoder_config = None generation_config: str = "auto" + skip_tokenizer_init: bool = False def get_diff_sampling_param(self): return self.diff_sampling_param or {} From 5490d633cec0e6b946d3f5c2d56e6236ef42eb40 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Sat, 30 Aug 2025 19:22:14 +0800 Subject: [PATCH 46/56] [UT] fix unify_kv_cache_configs when kv cache config needs sort (#23843) --- tests/v1/core/test_kv_cache_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index c4f927d69c2dd..e738f2bd46472 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -601,8 +601,14 @@ def test_unify_kv_cache_configs(): ] unify_kv_cache_configs(need_sort_kv_cache_config) - assert need_sort_kv_cache_config[0].num_blocks == 10 - assert need_sort_kv_cache_config[1].num_blocks == 10 + sorted_kv_cache_groups = [ + KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), + KVCacheGroupSpec(["layer2"], new_kv_cache_spec(num_kv_heads=4)), + ] + assert ( + need_sort_kv_cache_config[0].kv_cache_groups == sorted_kv_cache_groups) + assert ( + need_sort_kv_cache_config[1].kv_cache_groups == sorted_kv_cache_groups) diff_kv_cache_config = [ KVCacheConfig( From 3a6acad43177d612654082ed1d56fb8d2c442179 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Sat, 30 Aug 2025 21:31:26 +0800 Subject: [PATCH 47/56] [Model] Enable encoder DP for MiniCPM-V (#23948) Signed-off-by: zjy0516 Signed-off-by: Jiangyun Zhu Co-authored-by: Cyrus Leung --- docs/configuration/optimization.md | 2 +- vllm/model_executor/models/minicpmv.py 
| 43 +++++++++++++++++--------- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index b11ccb5c00273..2d8cdcc11fa99 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -175,7 +175,7 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u Known supported models: - Llama4 () -- MiniCPM-V-4 () +- MiniCPM-V-2.5 or above (, ) - Qwen2.5-VL () - Step3 () diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 0181bfeebda08..04176c5589ed6 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -977,6 +977,8 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): instantiated. """ + supports_encoder_tp_data = True + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if modality.startswith("image"): @@ -990,6 +992,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): config = vllm_config.model_config.hf_config multimodal_config = vllm_config.model_config.multimodal_config quant_config = vllm_config.quant_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" super().__init__() # All MiniCPM-V models disable `tie_word_embeddings` but # `PretrainedConfig.tie_word_embeddings` defaults to True; we cannot @@ -1237,6 +1240,8 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): class MiniCPMV2_0(MiniCPMVBaseModel): + supports_encoder_tp_data = False + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) assert self.version == (2, 0) @@ -1351,9 +1356,12 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): quant_config: Optional[QuantizationConfig], prefix: str = "", ) -> nn.Module: - model = Idefics2VisionTransformer(config.vision_config, - quant_config=quant_config, 
- prefix=prefix) + model = Idefics2VisionTransformer( + config.vision_config, + quant_config=quant_config, + prefix=prefix, + use_data_parallel=self.use_data_parallel, + ) if self.config.drop_vision_last_layer: model.encoder.layers = model.encoder.layers[:-1] return model @@ -1441,9 +1449,12 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> nn.Module: - model = Idefics2VisionTransformer(config.vision_config, - quant_config=quant_config, - prefix=prefix) + model = Idefics2VisionTransformer( + config.vision_config, + quant_config=quant_config, + prefix=prefix, + use_data_parallel=self.use_data_parallel, + ) if self.config.drop_vision_last_layer: model.encoder.layers = model.encoder.layers[:-1] return model @@ -1521,8 +1532,6 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA): ], } - supports_encoder_tp_data = True - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) assert self.version == (4, 0) @@ -1546,9 +1555,12 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA): prefix: str = "", ) -> nn.Module: quant_config = self._maybe_ignore_quant_config(quant_config) - model = Idefics2VisionTransformer(config.vision_config, - quant_config=quant_config, - prefix=prefix) + model = Idefics2VisionTransformer( + config.vision_config, + quant_config=quant_config, + prefix=prefix, + use_data_parallel=self.use_data_parallel, + ) if self.config.drop_vision_last_layer: model.encoder.layers = model.encoder.layers[:-1] return model @@ -1652,9 +1664,12 @@ class MiniCPMV4_5(MiniCPMVBaseModel, SupportsLoRA): prefix: str = "", ) -> nn.Module: quant_config = self._maybe_ignore_quant_config(quant_config) - model = Idefics2VisionTransformer(config.vision_config, - quant_config=quant_config, - prefix=prefix) + model = Idefics2VisionTransformer( + config.vision_config, + quant_config=quant_config, + prefix=prefix, + 
use_data_parallel=self.use_data_parallel, + ) if self.config.drop_vision_last_layer: model.encoder.layers = model.encoder.layers[:-1] return model From 379ea2823a751ef43de7b3e7bcb3262b1e14c510 Mon Sep 17 00:00:00 2001 From: "sadegh.shokatian" Date: Sat, 30 Aug 2025 06:40:02 -0700 Subject: [PATCH 48/56] Add LoRA support for DeepSeek models (V2, V3, R1-0528) (#23971) Signed-off-by: sadeghja1070 Signed-off-by: Jee Jee Li Co-authored-by: Claude Co-authored-by: Jee Jee Li Co-authored-by: Cyrus Leung --- docs/models/supported_models.md | 6 +++--- vllm/model_executor/models/deepseek.py | 8 ++++++-- vllm/model_executor/models/deepseek_v2.py | 5 +++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 01c1090c6fca8..e8fe77e8d6c98 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -335,9 +335,9 @@ th { | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | | `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | | ✅︎ | ✅︎ | -| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | | ✅︎ | ✅︎ | -| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3`, etc. | | ✅︎ | ✅︎ | +| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. 
| ✅︎ | ✅︎ | ✅︎ | +| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ | | `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ | diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index e815f13d66dcc..3f9349d766df6 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -52,7 +52,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsPP +from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, @@ -439,7 +439,11 @@ class DeepseekModel(nn.Module): return loaded_params -class DeepseekForCausalLM(nn.Module, SupportsPP): +class DeepseekForCausalLM(nn.Module, SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 61e8090411f27..36c9427e474e9 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -56,7 +56,7 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces 
import MixtureOfExperts, SupportsPP +from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -727,7 +727,8 @@ class DeepseekV2Model(nn.Module): return hidden_states -class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts): +class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts, + SupportsLoRA): packed_modules_mapping = { "gate_up_proj": ["gate_proj", "up_proj"], } From fb4983e112a81f4df25b92ab98c9c84a5babfec9 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Sat, 30 Aug 2025 21:41:45 +0800 Subject: [PATCH 49/56] [Misc] add reorder_batch AttentionMetadataBuilder (#23798) Signed-off-by: Andy Xie --- vllm/v1/attention/backends/utils.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index ad53b2e80bc73..011a90ece01bd 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -212,6 +212,23 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]): """ raise NotImplementedError + def reorder_batch(self, input_batch: "InputBatch", + scheduler_output: "SchedulerOutput") -> bool: + """ + Update the order of requests in the batch based on the attention + backend's needs. For example, some attention backends (namely MLA) may + want to separate requests based on if the attention computation will be + compute-bound or memory-bound. + + Args: + input_batch: input batch + scheduler_output: scheduler output. + + Returns: + True if the batch was modified, False otherwise. 
+ """ + raise NotImplementedError + def build_for_cudagraph_capture( self, common_attn_metadata: CommonAttentionMetadata) -> M: """ From e80bca309eb866e6d62e081e6ad80f3e10c113e9 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Sat, 30 Aug 2025 21:42:25 +0800 Subject: [PATCH 50/56] [Refactor] refactor freezing_value/cuda_event initialize outside try finally (#23758) Signed-off-by: Andy Xie --- vllm/v1/worker/cpu_model_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 226d7792a42f7..360a626979e54 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -128,8 +128,8 @@ def _torch_cuda_wrapper(): self.record = lambda: None self.synchronize = lambda: None + cuda_event = torch.cuda.Event try: - cuda_event = torch.cuda.Event torch.cuda.Event = _EventPlaceholder yield finally: @@ -141,9 +141,9 @@ def _set_global_compilation_settings(config: VllmConfig): import torch._inductor.config inductor_config = config.compilation_config.inductor_compile_config + # Note: The MKLDNN and CPPGEMM backend requires freezing parameters. + freezing_value = torch._inductor.config.freezing try: - # Note: The MKLDNN and CPPGEMM backend requires freezing parameters. 
- freezing_value = torch._inductor.config.freezing if inductor_config.get("max_autotune", False): torch._inductor.config.freezing = True yield From 68a349114f2a90f17645a8172496b0803d677f29 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Sat, 30 Aug 2025 21:43:33 +0800 Subject: [PATCH 51/56] [Misc] enhance type hint for rearrange return value (#23519) Signed-off-by: Andy Xie --- vllm/distributed/eplb/eplb_state.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 042acf40d67c2..d5ab61473ab01 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -409,12 +409,14 @@ class EplbState: self.expert_rearrangement_step = 0 self.rearrange(model) - def rearrange(self, - model: MixtureOfExperts, - is_profile: bool = False, - execute_shuffle: bool = True, - global_expert_load: Optional[torch.Tensor] = None, - rank_mapping: Optional[dict[int, int]] = None) -> None: + def rearrange( + self, + model: MixtureOfExperts, + is_profile: bool = False, + execute_shuffle: bool = True, + global_expert_load: Optional[torch.Tensor] = None, + rank_mapping: Optional[dict[int, + int]] = None) -> Optional[torch.Tensor]: """ Rearrange the experts according to the current load. 
""" @@ -548,6 +550,7 @@ class EplbState: " (profile) " if is_profile else " ", time_end - time_start, ) + return None @staticmethod def recv_state() -> tuple[torch.Tensor, torch.Tensor]: @@ -613,4 +616,4 @@ def _node_count_with_rank_mapping( if is_same_node and node_assignment[other_rank] == 0: node_assignment[other_rank] = next_node_id - return next_node_id \ No newline at end of file + return next_node_id From 038e9be4eb7a63189c8980845d80cb96957b9919 Mon Sep 17 00:00:00 2001 From: Andy Lo Date: Sat, 30 Aug 2025 16:37:39 +0100 Subject: [PATCH 52/56] [LoRA] Much faster startup when LoRA is enabled (#23777) Signed-off-by: Andy Lo Co-authored-by: Jee Jee Li --- vllm/v1/worker/gpu_model_runner.py | 11 ++++++--- vllm/v1/worker/gpu_worker.py | 5 +++- vllm/v1/worker/lora_model_runner_mixin.py | 30 ++++++++++++++++------- 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c6d50c17f2b4d..d6717892d4aec 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2213,6 +2213,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): uniform_decode: bool = False, skip_eplb: bool = False, is_profile: bool = False, + remove_lora: bool = True, ) -> tuple[torch.Tensor, torch.Tensor]: """ Run a dummy forward pass to warm up/profile run or capture the @@ -2230,6 +2231,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): uniform_decode: If True, the batch is a uniform decode batch. skip_eplb: If True, skip EPLB state update. is_profile: If True, this is a profile run. 
+ remove_lora: If False, dummy LoRAs are not destroyed after the run """ assert cudagraph_runtime_mode in { CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL @@ -2317,7 +2319,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): attn_metadata[layer_name] = attn_metadata_i with self.maybe_dummy_run_with_lora(self.lora_config, - num_scheduled_tokens): + num_scheduled_tokens, remove_lora): if self.supports_mm_inputs: input_ids = None inputs_embeds = self.inputs_embeds[:num_tokens] @@ -2708,11 +2710,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cudagraph_runtime_mode=CUDAGraphMode.NONE, force_attention=force_attention, uniform_decode=uniform_decode, - skip_eplb=True) + skip_eplb=True, + remove_lora=False) self._dummy_run(num_tokens, cudagraph_runtime_mode=cudagraph_runtime_mode, uniform_decode=uniform_decode, - skip_eplb=True) + skip_eplb=True, + remove_lora=False) + self.maybe_remove_all_loras(self.lora_config) def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: """ diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 2088bfff5bb39..2e7d6685377f2 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -308,7 +308,10 @@ class Worker(WorkerBase): # We skip EPLB here since we don't want to record dummy metrics for size in sorted(warmup_sizes, reverse=True): logger.info("Compile and warming up model for size %d", size) - self.model_runner._dummy_run(size, skip_eplb=True) + self.model_runner._dummy_run(size, + skip_eplb=True, + remove_lora=False) + self.model_runner.maybe_remove_all_loras(self.model_runner.lora_config) # Warmup and tune the kernels used during model execution before # cuda graph capture. 
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 84ed46989ea97..4b5f27d27541b 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -5,7 +5,7 @@ Define LoRA functionality mixin for model runners. """ from contextlib import contextmanager -from typing import Union +from typing import Optional, Union import numpy as np import torch @@ -87,7 +87,9 @@ class LoRAModelRunnerMixin: lora_requests) @contextmanager - def maybe_setup_dummy_loras(self, lora_config): + def maybe_setup_dummy_loras(self, + lora_config: Optional[LoRAConfig], + remove_lora: bool = True): if lora_config is None: yield else: @@ -114,10 +116,11 @@ class LoRAModelRunnerMixin: yield # __exit__ code - self.lora_manager.remove_all_adapters() + if remove_lora: + self.lora_manager.remove_all_adapters() @contextmanager - def maybe_select_dummy_loras(self, lora_config: LoRAConfig, + def maybe_select_dummy_loras(self, lora_config: Optional[LoRAConfig], num_scheduled_tokens: np.ndarray): if lora_config is None: yield @@ -151,13 +154,22 @@ class LoRAModelRunnerMixin: yield @contextmanager - def maybe_dummy_run_with_lora(self, lora_config: LoRAConfig, - num_scheduled_tokens: np.ndarray): - with self.maybe_setup_dummy_loras( - lora_config), self.maybe_select_dummy_loras( - lora_config, num_scheduled_tokens): + def maybe_dummy_run_with_lora(self, + lora_config: Optional[LoRAConfig], + num_scheduled_tokens: np.ndarray, + remove_lora: bool = True): + with ( + self.maybe_setup_dummy_loras(lora_config, remove_lora), + self.maybe_select_dummy_loras(lora_config, + num_scheduled_tokens), + ): yield + def maybe_remove_all_loras(self, lora_config: Optional[LoRAConfig]): + if lora_config is None: + return + self.lora_manager.remove_all_adapters() + def add_lora(self, lora_request: LoRARequest) -> bool: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") From 5b8077b8ac42625a3465ad1f885e409d33e0e42e Mon 
Sep 17 00:00:00 2001 From: Gabriel Marinho <104592062+gmarinho2@users.noreply.github.com> Date: Sat, 30 Aug 2025 17:39:38 -0300 Subject: [PATCH 53/56] Fix wrong truncate_prompt_tokens type hint (#22761) Signed-off-by: Gabriel Marinho Signed-off-by: Gabriel Marinho <104592062+gmarinho2@users.noreply.github.com> Signed-off-by: Max de Bayser Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Max de Bayser --- vllm/entrypoints/llm.py | 41 +++++++--------- vllm/entrypoints/openai/protocol.py | 28 +++++++---- vllm/entrypoints/openai/serving_chat.py | 1 - .../openai/serving_classification.py | 13 ----- vllm/entrypoints/openai/serving_completion.py | 1 - vllm/entrypoints/openai/serving_embedding.py | 14 ------ vllm/entrypoints/openai/serving_engine.py | 47 ++++++++----------- vllm/entrypoints/openai/serving_pooling.py | 2 - vllm/entrypoints/openai/serving_score.py | 6 +-- vllm/inputs/preprocess.py | 22 ++++++++- vllm/pooling_params.py | 7 ++- vllm/sampling_params.py | 14 ++++-- vllm/transformers_utils/tokenizer_group.py | 1 + vllm/utils/__init__.py | 6 +++ 14 files changed, 101 insertions(+), 102 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 9d900e691b0a0..479524a117995 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -51,7 +51,7 @@ from vllm.tasks import PoolingTask from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, get_cached_tokenizer) from vllm.usage.usage_lib import UsageContext -from vllm.utils import Counter, Device, is_list_of +from vllm.utils import Counter, Device, as_iter, is_list_of from vllm.v1.sample.logits_processor import LogitsProcessor if TYPE_CHECKING: @@ -364,14 +364,6 @@ class LLM: # Use default sampling params. 
sampling_params = self.get_default_sampling_params() - tokenization_kwargs: dict[str, Any] = {} - truncate_prompt_tokens = None - if isinstance(sampling_params, SamplingParams): - truncate_prompt_tokens = sampling_params.truncate_prompt_tokens - - _validate_truncation_size(model_config.max_model_len, - truncate_prompt_tokens, tokenization_kwargs) - # Add any modality specific loras to the corresponding prompts lora_request = self._get_modality_specific_lora_reqs( prompts, lora_request) @@ -381,7 +373,6 @@ class LLM: params=sampling_params, use_tqdm=use_tqdm, lora_request=lora_request, - tokenization_kwargs=tokenization_kwargs, priority=priority, ) @@ -871,6 +862,8 @@ class LLM: If `False`, no progress bar is created. lora_request: LoRA request to use for generation, if any. pooling_task: Override the pooling task to use. + tokenization_kwargs: overrides tokenization_kwargs set in + pooling_params Returns: A list of `PoolingRequestOutput` objects containing the @@ -916,24 +909,17 @@ class LLM: # Use default pooling params. 
pooling_params = PoolingParams() - if isinstance(pooling_params, PoolingParams): - pooling_params.verify(pooling_task, model_config) - else: - for pooling_param in pooling_params: - pooling_param.verify(pooling_task, model_config) - - if tokenization_kwargs is None: - tokenization_kwargs = dict[str, Any]() - _validate_truncation_size(model_config.max_model_len, - truncate_prompt_tokens, - tokenization_kwargs) + for param in as_iter(pooling_params): + param.verify(pooling_task, model_config) + # for backwards compatibility + if truncate_prompt_tokens is not None: + param.truncate_prompt_tokens = truncate_prompt_tokens self._validate_and_add_requests( prompts=prompts, params=pooling_params, use_tqdm=use_tqdm, lora_request=lora_request, - tokenization_kwargs=tokenization_kwargs, ) outputs = self._run_engine(use_tqdm=use_tqdm) @@ -1385,7 +1371,6 @@ class LLM: *, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]], - tokenization_kwargs: Optional[dict[str, Any]] = None, priority: Optional[list[int]] = None, ) -> None: if isinstance(prompts, (str, dict)): @@ -1412,7 +1397,17 @@ class LLM: tqdm_func = use_tqdm if callable(use_tqdm) else tqdm it = tqdm_func(it, desc="Adding requests") + model_config = self.llm_engine.model_config + for i, prompt in enumerate(it): + + param = params[i] if isinstance(params, Sequence) else params + + tokenization_kwargs: dict[str, Any] = {} + _validate_truncation_size(model_config.max_model_len, + param.truncate_prompt_tokens, + tokenization_kwargs) + self._add_request( prompt, params[i] if isinstance(params, Sequence) else params, diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 5cb41bd93d4bc..0fa1136b47b85 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -452,7 +452,7 @@ class ChatCompletionRequest(OpenAIBaseModel): min_tokens: int = 0 skip_special_tokens: bool = True 
spaces_between_special_tokens: bool = True - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None prompt_logprobs: Optional[int] = None allowed_token_ids: Optional[list[int]] = None bad_words: list[str] = Field(default_factory=list) @@ -995,7 +995,7 @@ class CompletionRequest(OpenAIBaseModel): min_tokens: int = 0 skip_special_tokens: bool = True spaces_between_special_tokens: bool = True - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None allowed_token_ids: Optional[list[int]] = None prompt_logprobs: Optional[int] = None # --8<-- [end:completion-sampling-params] @@ -1325,8 +1325,10 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): # --8<-- [end:embedding-extra-params] def to_pooling_params(self): - return PoolingParams(dimensions=self.dimensions, - normalize=self.normalize) + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + dimensions=self.dimensions, + normalize=self.normalize) class EmbeddingChatRequest(OpenAIBaseModel): @@ -1393,8 +1395,10 @@ class EmbeddingChatRequest(OpenAIBaseModel): return data def to_pooling_params(self): - return PoolingParams(dimensions=self.dimensions, - normalize=self.normalize) + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + dimensions=self.dimensions, + normalize=self.normalize) EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest] @@ -1430,7 +1434,9 @@ class ScoreRequest(OpenAIBaseModel): # --8<-- [end:score-extra-params] def to_pooling_params(self): - return PoolingParams(activation=self.activation) + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + activation=self.activation) class RerankRequest(OpenAIBaseModel): @@ -1460,7 +1466,9 @@ class RerankRequest(OpenAIBaseModel): # --8<-- [end:rerank-extra-params] def to_pooling_params(self): - return 
PoolingParams(activation=self.activation) + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + activation=self.activation) class RerankDocument(BaseModel): @@ -1618,7 +1626,9 @@ class ClassificationRequest(OpenAIBaseModel): # --8<-- [end:classification-extra-params] def to_pooling_params(self): - return PoolingParams(activation=self.activation) + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + activation=self.activation) class ClassificationData(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 1c0ffdfb91897..6300d0758c3d4 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -237,7 +237,6 @@ class OpenAIServingChat(OpenAIServing): documents=request.documents, chat_template_kwargs=request.chat_template_kwargs, tool_parser=tool_parser, - truncate_prompt_tokens=request.truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) else: diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py index 1d510d0b60a2d..b4fdc36390319 100644 --- a/vllm/entrypoints/openai/serving_classification.py +++ b/vllm/entrypoints/openai/serving_classification.py @@ -61,7 +61,6 @@ class ClassificationMixin(OpenAIServing): ctx.request, ctx.tokenizer, ctx.request.input, - truncate_prompt_tokens=ctx.request.truncate_prompt_tokens, ) return None @@ -157,18 +156,6 @@ class ServingClassification(ClassificationMixin): return await super().handle(ctx) # type: ignore - @override - def _validate_request( - self, - ctx: ClassificationServeContext, - ) -> Optional[ErrorResponse]: - if error := super()._validate_request(ctx): - return error - - ctx.truncate_prompt_tokens = ctx.request.truncate_prompt_tokens - - return None - @override def _create_pooling_params( self, diff --git a/vllm/entrypoints/openai/serving_completion.py 
b/vllm/entrypoints/openai/serving_completion.py index f461d7609b945..11effba8f9eb3 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -137,7 +137,6 @@ class OpenAIServingCompletion(OpenAIServing): request, tokenizer, request.prompt, - truncate_prompt_tokens=request.truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) except ValueError as e: diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 45c1932f1873c..0a0d98db2d0d8 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -97,7 +97,6 @@ class EmbeddingMixin(OpenAIServing): # so there is no need to append extra tokens to the input add_generation_prompt=False, continue_final_message=False, - truncate_prompt_tokens=ctx.truncate_prompt_tokens, add_special_tokens=ctx.request.add_special_tokens, ) else: @@ -106,7 +105,6 @@ class EmbeddingMixin(OpenAIServing): ctx.request, tokenizer, ctx.request.input, - truncate_prompt_tokens=ctx.truncate_prompt_tokens, add_special_tokens=ctx.request.add_special_tokens, ) return None @@ -631,18 +629,6 @@ class OpenAIServingEmbedding(EmbeddingMixin): return await super().handle(ctx) # type: ignore - @override - def _validate_request( - self, - ctx: ServeContext[EmbeddingRequest], - ) -> Optional[ErrorResponse]: - if error := super()._validate_request(ctx): - return error - - ctx.truncate_prompt_tokens = ctx.request.truncate_prompt_tokens - - return None - @override def _create_pooling_params( self, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index ca6f3987936da..320c1e61f1d13 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -165,7 +165,6 @@ class ServeContext(RequestProcessingMixin, ResponseGenerationMixin, BaseModel, # Shared across most requests tokenizer: Optional[AnyTokenizer] 
= None - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None # `protected_namespaces` resolves Pydantic v2's warning # on conflict with protected namespace "model_" @@ -297,14 +296,12 @@ class OpenAIServing: truncate_prompt_tokens = getattr(ctx.request, "truncate_prompt_tokens", None) - if truncate_prompt_tokens is not None: - if truncate_prompt_tokens <= self.max_model_len: - ctx.truncate_prompt_tokens = truncate_prompt_tokens - else: - return self.create_error_response( - "truncate_prompt_tokens value is " - "greater than max_model_len." - " Please, select a smaller truncation size.") + if truncate_prompt_tokens is not None and \ + truncate_prompt_tokens > self.max_model_len: + return self.create_error_response( + "truncate_prompt_tokens value is " + "greater than max_model_len." + " Please, select a smaller truncation size.") return None def _create_pooling_params( @@ -528,7 +525,6 @@ class OpenAIServing: request: AnyRequest, prompt: str, tokenizer: AnyTokenizer, - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]], add_special_tokens: bool, ) -> TextTokensPrompt: async_tokenizer = self._get_async_tokenizer(tokenizer) @@ -538,6 +534,9 @@ class OpenAIServing: "do_lower_case", False)): prompt = prompt.lower() + truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", + None) + if truncate_prompt_tokens is None: encoded = await async_tokenizer( prompt, add_special_tokens=add_special_tokens) @@ -565,8 +564,10 @@ class OpenAIServing: request: AnyRequest, prompt_ids: list[int], tokenizer: Optional[AnyTokenizer], - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]], ) -> TextTokensPrompt: + truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", + None) + if truncate_prompt_tokens is None: input_ids = prompt_ids elif truncate_prompt_tokens < 0: @@ -652,7 +653,6 @@ class OpenAIServing: request: AnyRequest, tokenizer: AnyTokenizer, prompt_input: Union[str, list[int]], - truncate_prompt_tokens: 
Optional[Annotated[int, Field(ge=-1)]] = None, add_special_tokens: bool = True, ) -> TextTokensPrompt: """ @@ -664,7 +664,6 @@ class OpenAIServing: request, tokenizer, [prompt_input], - truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=add_special_tokens, ): return result @@ -675,7 +674,6 @@ class OpenAIServing: request: AnyRequest, tokenizer: AnyTokenizer, prompt_inputs: Iterable[Union[str, list[int]]], - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, add_special_tokens: bool = True, ) -> AsyncGenerator[TextTokensPrompt, None]: """ @@ -689,7 +687,6 @@ class OpenAIServing: request, prompt=prompt, tokenizer=tokenizer, - truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=add_special_tokens, ) else: @@ -697,7 +694,6 @@ class OpenAIServing: request, prompt_ids=prompt, tokenizer=tokenizer, - truncate_prompt_tokens=truncate_prompt_tokens, ) async def _tokenize_prompt_input_or_inputs_async( @@ -706,7 +702,6 @@ class OpenAIServing: tokenizer: Optional[AnyTokenizer], input_or_inputs: Optional[Union[str, list[str], list[int], list[list[int]]]], - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, add_special_tokens: bool = True, ) -> tuple[list[TextTokensPrompt], list[EmbedsPrompt]]: """ @@ -719,6 +714,12 @@ class OpenAIServing: inputs_embeds = list[EmbedsPrompt]() inputs_text = list[TextTokensPrompt]() + truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", + None) + + if (truncate_prompt_tokens or 0) < 0: + truncate_prompt_tokens = self.max_model_len + if (isinstance(request, CompletionRequest) and request.prompt_embeds is not None): inputs_embeds.extend( @@ -748,14 +749,10 @@ class OpenAIServing: request, prompt_input["content"], tokenizer=tokenizer, - truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=add_special_tokens) else: task = self._normalize_prompt_tokens_to_input( - request, - prompt_input["content"], - tokenizer=tokenizer, - 
truncate_prompt_tokens=truncate_prompt_tokens) + request, prompt_input["content"], tokenizer=tokenizer) tasks.append(task) # Wait for all tokenization tasks to complete @@ -772,7 +769,6 @@ class OpenAIServing: TokenizeCompletionRequest], tokenizer: Optional[AnyTokenizer], input_or_inputs: Union[str, list[str], list[int], list[list[int]]], - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = ..., add_special_tokens: bool = ..., ) -> tuple[list[TextTokensPrompt], list[EngineTokensPrompt]]: ... @@ -784,7 +780,6 @@ class OpenAIServing: tokenizer: Optional[AnyTokenizer], input_or_inputs: Optional[Union[str, list[str], list[int], list[list[int]]]], - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = ..., add_special_tokens: bool = ..., ) -> tuple[list[Union[TextTokensPrompt, EmbedsPrompt]], list[Union[ EngineTokensPrompt, EngineEmbedsPrompt]]]: @@ -796,7 +791,6 @@ class OpenAIServing: tokenizer: Optional[AnyTokenizer], input_or_inputs: Optional[Union[str, list[str], list[int], list[list[int]]]], - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, add_special_tokens: bool = True, ) -> tuple[Union[list[TextTokensPrompt], list[Union[ TextTokensPrompt, EmbedsPrompt]]], Union[ @@ -813,7 +807,6 @@ class OpenAIServing: request, tokenizer, input_or_inputs, - truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=add_special_tokens, ) @@ -866,7 +859,6 @@ class OpenAIServing: documents: Optional[list[dict[str, str]]] = None, chat_template_kwargs: Optional[dict[str, Any]] = None, tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None, - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = False, ) -> tuple[list[ConversationMessage], Sequence[RequestPrompt], list[EngineTokensPrompt]]: @@ -941,7 +933,6 @@ class OpenAIServing: request, tokenizer, request_prompt, - truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=add_special_tokens, ) else: diff --git 
a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index e8cb1aed84596..b2c2af2ec58e0 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -120,7 +120,6 @@ class OpenAIServingPooling(OpenAIServing): # so there is no need to append extra tokens to the input add_generation_prompt=False, continue_final_message=False, - truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) else: @@ -129,7 +128,6 @@ class OpenAIServingPooling(OpenAIServing): request, tokenizer, request.input, - truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) except (ValueError, TypeError, jinja2.TemplateError) as e: diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index c54deb371d545..847c014a11dc3 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -266,12 +266,14 @@ class ServingScores(OpenAIServing): request: Union[ScoreRequest, RerankRequest], request_id: str, raw_request: Optional[Request] = None, - truncate_prompt_tokens: Optional[int] = None, ) -> Union[list[PoolingRequestOutput], ErrorResponse]: lora_request = self._maybe_get_adapters(request) tokenizer = await self.engine_client.get_tokenizer(lora_request) + truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", + None) + tokenization_kwargs: dict[str, Any] = {} _validate_truncation_size(self.max_model_len, truncate_prompt_tokens, tokenization_kwargs) @@ -343,7 +345,6 @@ class ServingScores(OpenAIServing): request, request_id, raw_request, - request.truncate_prompt_tokens, ) if isinstance(final_res_batch, ErrorResponse): return final_res_batch @@ -391,7 +392,6 @@ class ServingScores(OpenAIServing): request, request_id, raw_request, - request.truncate_prompt_tokens, ) if isinstance(final_res_batch, ErrorResponse): return final_res_batch diff --git 
a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 3dbd9057fe0f7..2f2fbe274bf07 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -346,6 +346,22 @@ class InputPreprocessor: ) -> EmbedsInputs: return self._process_embeds(parsed_content) + def _truncate_inputs( + self, + inputs: list[int], + tokenization_kwargs: Optional[dict[str, Any]] = None) -> list[int]: + + if not tokenization_kwargs or "truncation" not in \ + tokenization_kwargs or self.tokenizer is None: + return inputs + + max_length = tokenization_kwargs["max_length"] + + if self.tokenizer.truncation_side == "left": + return inputs[-max_length:] + else: + return inputs[:max_length] + def _process_tokens( self, parsed_content: TokensPrompt, @@ -354,7 +370,8 @@ class InputPreprocessor: *, mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> Union[TokenInputs, MultiModalInputs]: - prompt_token_ids = parsed_content["prompt_token_ids"] + prompt_token_ids = self._truncate_inputs( + parsed_content["prompt_token_ids"], tokenization_kwargs) inputs: Union[TokenInputs, MultiModalInputs] if multi_modal_data := parsed_content.get("multi_modal_data"): @@ -382,7 +399,8 @@ class InputPreprocessor: *, mm_hash_overrides: Optional[dict[str, list[str]]] = None, ) -> Union[TokenInputs, MultiModalInputs]: - prompt_token_ids = parsed_content["prompt_token_ids"] + prompt_token_ids = self._truncate_inputs( + parsed_content["prompt_token_ids"], tokenization_kwargs) inputs: Union[TokenInputs, MultiModalInputs] if multi_modal_data := parsed_content.get("multi_modal_data"): diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 29f037b4372cd..6672392b8d080 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import deepcopy -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Annotated, Any, Optional import msgspec @@ -27,6 +27,11 @@ class 
PoolingParams( the classification outputs. softmax: Whether to apply softmax to the reward outputs. """ + truncate_prompt_tokens: Optional[Annotated[int, + msgspec.Meta(ge=-1)]] = None + """If set to -1, will use the truncation size supported by the model. If + set to an integer k, will use only the last k tokens from the prompt + (i.e., left truncation). If set to `None`, truncation is disabled.""" ## for embeddings models dimensions: Optional[int] = None diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index df4cca9ba1147..c7b4ba34c602e 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -182,7 +182,8 @@ class SamplingParams( optionally prompt tokens as a first argument.""" include_stop_str_in_output: bool = False """Whether to include the stop strings in output text.""" - truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None + truncate_prompt_tokens: Optional[Annotated[int, + msgspec.Meta(ge=-1)]] = None """If set to -1, will use the truncation size supported by the model. If set to an integer k, will use only the last k tokens from the prompt (i.e., left truncation). 
If set to `None`, truncation is disabled.""" @@ -241,7 +242,8 @@ class SamplingParams( spaces_between_special_tokens: bool = True, logits_processors: Optional[list[LogitsProcessor]] = None, truncate_prompt_tokens: Optional[Annotated[int, - msgspec.Meta(ge=1)]] = None, + msgspec.Meta( + ge=-1)]] = None, output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, guided_decoding: Optional[GuidedDecodingParams] = None, logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None, @@ -411,9 +413,11 @@ class SamplingParams( raise ValueError(f"prompt_logprobs must be non-negative, got " f"{self.prompt_logprobs}.") if (self.truncate_prompt_tokens is not None - and self.truncate_prompt_tokens < 1): - raise ValueError(f"truncate_prompt_tokens must be >= 1, " - f"got {self.truncate_prompt_tokens}") + and (self.truncate_prompt_tokens == 0 + or self.truncate_prompt_tokens < -1)): + raise ValueError( + f"truncate_prompt_tokens must be an integer >= 1 or -1, " + f"got {self.truncate_prompt_tokens}") assert isinstance(self.stop_token_ids, list) if not all(isinstance(st_id, int) for st_id in self.stop_token_ids): raise ValueError(f"stop_token_ids must contain only integers, " diff --git a/vllm/transformers_utils/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group.py index a8bb0398dfdb1..ae8220f9b9dc5 100644 --- a/vllm/transformers_utils/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group.py @@ -23,6 +23,7 @@ class TokenizerGroup: self.tokenizer_config = tokenizer_config self.enable_lora = enable_lora self.max_input_length = max_input_length + self.truncation_side = tokenizer_config.get("truncation_side", "left") self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config) max_loras = tokenizer_config.get("max_loras", 0) self.lora_tokenizers = LRUCache[int, AnyTokenizer]( diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index c5ed10326fd50..698aaab3aaa02 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ 
-1328,6 +1328,12 @@ def as_list(maybe_list: Iterable[T]) -> list[T]: return maybe_list if isinstance(maybe_list, list) else list(maybe_list) +def as_iter(obj: Union[T, Iterable[T]]) -> Iterable[T]: + if isinstance(obj, str) or not isinstance(obj, Iterable): + obj = [obj] + return obj + + # `collections` helpers def is_list_of( value: object, From 749be00a98eef8eab262cc3119893c00dbca22e9 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 30 Aug 2025 18:01:22 -0700 Subject: [PATCH 54/56] [Core][Multimodal] Allow passing `multi_modal_uuids` as multimodal identifiers. (#23394) Signed-off-by: Roger Wang --- docs/features/multimodal_inputs.md | 35 +++ .../test_processor_multi_modal_uuids.py | 229 ++++++++++++++++++ vllm/entrypoints/openai/serving_engine.py | 2 +- vllm/inputs/data.py | 20 +- vllm/inputs/preprocess.py | 44 ++-- vllm/multimodal/__init__.py | 6 +- vllm/multimodal/hasher.py | 7 +- vllm/multimodal/inputs.py | 15 +- vllm/multimodal/processing.py | 91 +++++-- vllm/v1/engine/processor.py | 60 ++++- 10 files changed, 455 insertions(+), 54 deletions(-) create mode 100644 tests/v1/engine/test_processor_multi_modal_uuids.py diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 9d51f9cf52f50..206ab7a468755 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -13,6 +13,41 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]: - `prompt`: The prompt should follow the format that is documented on HuggingFace. - `multi_modal_data`: This is a dictionary that follows the schema defined in [vllm.multimodal.inputs.MultiModalDataDict][]. +### Stable UUIDs for Caching (multi_modal_uuids) + +When using multi-modal inputs, vLLM normally hashes each media item by content to enable caching across requests. You can optionally pass `multi_modal_uuids` to provide your own stable IDs for each item so caching can reuse work across requests without rehashing the raw content. + +??? 
code + + ```python + from vllm import LLM + from PIL import Image + + # Qwen2.5-VL example with two images + llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct") + + prompt = "USER: \nDescribe the differences.\nASSISTANT:" + img_a = Image.open("/path/to/a.jpg") + img_b = Image.open("/path/to/b.jpg") + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": [img_a, img_b]}, + # Provide stable IDs for caching. + # Requirements (matched by this example): + # - Include every modality present in multi_modal_data. + # - For lists, provide the same number of entries. + # - Use None to fall back to content hashing for that item. + "multi_modal_uuids": {"image": ["sku-1234-a", None]}, + }) + + for o in outputs: + print(o.outputs[0].text) + ``` + +!!! warning + If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored. + ### Image Inputs You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples: diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py b/tests/v1/engine/test_processor_multi_modal_uuids.py new file mode 100644 index 0000000000000..970a59eca8ece --- /dev/null +++ b/tests/v1/engine/test_processor_multi_modal_uuids.py @@ -0,0 +1,229 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig +from vllm.platforms.interface import UnspecifiedPlatform +from vllm.sampling_params import SamplingParams +from vllm.v1.engine import processor as processor_mod +from vllm.v1.engine.processor import Processor + +cherry_pil_image = ImageAsset("cherry_blossom").pil_image +stop_pil_image = ImageAsset("stop_sign").pil_image +baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays + + +# Mock processor 
for testing +def _mk_processor(monkeypatch, + *, + mm_cache_gb: float = 4.0, + enable_prefix_caching: bool = True) -> Processor: + """ + Create a Processor instance with minimal configuration suitable for unit + tests without accessing external resources. + """ + monkeypatch.setattr(ModelConfig, + "try_get_generation_config", + lambda self: {}, + raising=True) + monkeypatch.setattr(ModelConfig, + "__post_init__", + lambda self: None, + raising=True) + monkeypatch.setattr(UnspecifiedPlatform, + "is_async_output_supported", + classmethod(lambda cls, enforce_eager: True), + raising=True) + monkeypatch.setattr( + ModelConfig, + "verify_async_output_proc", + lambda self, parallel_config, speculative_config, device_config: None, + raising=True) + monkeypatch.setattr(ModelConfig, + "verify_with_parallel_config", + lambda self, parallel_config: None, + raising=True) + monkeypatch.setattr(processor_mod, + "processor_cache_from_config", + lambda vllm_config, mm_registry: None, + raising=True) + + monkeypatch.setattr(VllmConfig, + "__post_init__", + lambda self: None, + raising=True) + + model_config = ModelConfig( + skip_tokenizer_init=True, + max_model_len=128, + mm_processor_cache_gb=mm_cache_gb, + generation_config="vllm", + tokenizer="dummy", + ) + + # Minimal multimodal_config to satisfy references in + # Processor.process_inputs. + class _MockMMConfig: + + def __init__(self, gb: float): + self.mm_processor_cache_gb = gb + + model_config.multimodal_config = _MockMMConfig( + mm_cache_gb) # type: ignore[attr-defined] + vllm_config = VllmConfig( + model_config=model_config, + cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching), + device_config=DeviceConfig(device="cpu"), + ) + + # Pass tokenizer=None; InputPreprocessor handles None when + # skip_tokenizer_init is True. 
+ return Processor(vllm_config, tokenizer=None) # type: ignore[arg-type] + + +def test_multi_modal_uuids_length_mismatch_raises(monkeypatch): + processor = _mk_processor(monkeypatch) + + prompt = { + "prompt": "USER: \nDescribe\nASSISTANT:", + "multi_modal_data": { + "image": [cherry_pil_image, stop_pil_image] + }, + # Mismatch: 2 items but only 1 uuid provided + "multi_modal_uuids": { + "image": ["hash_cherry"] + }, + } + + with pytest.raises(ValueError, match="must have same length as data"): + processor.process_inputs( + request_id="req-1", + prompt=prompt, # type: ignore[arg-type] + params=SamplingParams(), + ) + + +def test_multi_modal_uuids_missing_modality_raises(monkeypatch): + processor = _mk_processor(monkeypatch) + + prompt = { + "prompt": "USER: