From d9d342d214b8c13f71215318a6d9252cc4a5ca47 Mon Sep 17 00:00:00 2001 From: Pleaplusone Date: Wed, 26 Nov 2025 12:45:28 +0800 Subject: [PATCH 001/197] [Performance][MLA][ROCm] Remove redundant D2D copy in deepseek (#27457) Signed-off-by: ganyi --- csrc/attention/merge_attn_states.cu | 27 +++++++-------- csrc/ops.h | 3 +- csrc/torch_bindings.cpp | 3 +- .../attention/ops/triton_merge_attn_states.py | 23 +++++++++---- vllm/v1/attention/backends/mla/common.py | 34 ++++++++++--------- 5 files changed, 49 insertions(+), 41 deletions(-) diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu index 229d9862fb670..27d1e990c611e 100644 --- a/csrc/attention/merge_attn_states.cu +++ b/csrc/attention/merge_attn_states.cu @@ -16,7 +16,8 @@ __global__ void merge_attn_states_kernel( scalar_t* output, float* output_lse, const scalar_t* prefix_output, const float* prefix_lse, const scalar_t* suffix_output, const float* suffix_lse, const uint num_tokens, const uint num_heads, - const uint head_size) { + const uint head_size, const uint prefix_head_stride, + const uint output_head_stride) { using pack_128b_t = uint4; const uint pack_size = 16 / sizeof(scalar_t); const uint threads_per_head = head_size / pack_size; @@ -34,11 +35,13 @@ __global__ void merge_attn_states_kernel( const uint head_idx = token_head_idx % num_heads; const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc. 
- const uint head_offset = - token_idx * num_heads * head_size + head_idx * head_size; - const scalar_t* prefix_head_ptr = prefix_output + head_offset; - const scalar_t* suffix_head_ptr = suffix_output + head_offset; - scalar_t* output_head_ptr = output + head_offset; + const uint src_head_offset = token_idx * num_heads * prefix_head_stride + + head_idx * prefix_head_stride; + const uint dst_head_offset = token_idx * num_heads * output_head_stride + + head_idx * output_head_stride; + const scalar_t* prefix_head_ptr = prefix_output + src_head_offset; + const scalar_t* suffix_head_ptr = suffix_output + src_head_offset; + scalar_t* output_head_ptr = output + dst_head_offset; float p_lse = prefix_lse[head_idx * num_tokens + token_idx]; float s_lse = suffix_lse[head_idx * num_tokens + token_idx]; @@ -140,7 +143,7 @@ __global__ void merge_attn_states_kernel( reinterpret_cast(prefix_lse.data_ptr()), \ reinterpret_cast(suffix_output.data_ptr()), \ reinterpret_cast(suffix_lse.data_ptr()), num_tokens, \ - num_heads, head_size); \ + num_heads, head_size, prefix_head_stride, output_head_stride); \ } /*@brief Merges the attention states from prefix and suffix @@ -166,17 +169,11 @@ void merge_attn_states_launcher(torch::Tensor& output, const uint num_tokens = output.size(0); const uint num_heads = output.size(1); const uint head_size = output.size(2); + const uint prefix_head_stride = prefix_output.stride(1); + const uint output_head_stride = output.stride(1); const uint pack_size = 16 / sizeof(scalar_t); TORCH_CHECK(head_size % pack_size == 0, "headsize must be multiple of pack_size:", pack_size); - TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1, - "output heads must be contiguous in memory"); - TORCH_CHECK( - prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1, - "prefix_output heads must be contiguous in memory"); - TORCH_CHECK( - suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1, - "suffix_output heads must be 
contiguous in memory"); float* output_lse_ptr = nullptr; if (output_lse.has_value()) { output_lse_ptr = output_lse.value().data_ptr(); diff --git a/csrc/ops.h b/csrc/ops.h index f8bdc61aaa8ec..4bb7857b15032 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -52,14 +52,13 @@ void paged_attention_v2( const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step); -#ifndef USE_ROCM void merge_attn_states(torch::Tensor& output, std::optional output_lse, const torch::Tensor& prefix_output, const torch::Tensor& prefix_lse, const torch::Tensor& suffix_output, const torch::Tensor& suffix_lse); - +#ifndef USE_ROCM void convert_vertical_slash_indexes( torch::Tensor& block_count, // [BATCH, N_HEADS, NUM_ROWS] torch::Tensor& block_offset, // [BATCH, N_HEADS, NUM_ROWS, NNZ_S] diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 14913bef13125..e9c96bb8b56cf 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -63,7 +63,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " int blocksparse_head_sliding_step) -> ()"); ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2); -#ifndef USE_ROCM // Merge attn states // Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005 // can be used to combine partial attention results (in the split-KV case) @@ -76,7 +75,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor suffix_output," " Tensor suffix_lse) -> ()"); ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states); - +#ifndef USE_ROCM ops.def( "convert_vertical_slash_indexes(" " Tensor! block_count, Tensor! 
block_offset, " diff --git a/vllm/attention/ops/triton_merge_attn_states.py b/vllm/attention/ops/triton_merge_attn_states.py index 3c87a24afd9c7..74e4d778ded87 100644 --- a/vllm/attention/ops/triton_merge_attn_states.py +++ b/vllm/attention/ops/triton_merge_attn_states.py @@ -20,7 +20,11 @@ def merge_attn_states( num_query_heads = output.shape[1] head_size = output.shape[2] padded_head_size = triton.next_power_of_2(head_size) - + # We assume the output stride on num_head is not always as same as the + # `suffix_output` and `prefix_output`, as them might be padded by the attention + # backend. + prefix_head_stride = prefix_output.stride(1) + output_head_stride = output.stride(1) # TODO(woosuk): Use CUDA kernel instead of Triton to minimize CPU overhead. merge_attn_states_kernel[(num_tokens, num_query_heads)]( output, @@ -29,6 +33,8 @@ def merge_attn_states( prefix_lse, suffix_output, suffix_lse, + prefix_head_stride, + output_head_stride, head_size, padded_head_size, output_lse is not None, @@ -43,6 +49,8 @@ def merge_attn_states_kernel( prefix_lse, # [NUM_HEADS, NUM_TOKENS] suffix_output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] suffix_lse, # [NUM_HEADS, NUM_TOKENS] + prefix_head_stride, + output_head_stride, HEAD_SIZE: tl.constexpr, PADDED_HEAD_SIZE: tl.constexpr, OUTPUT_LSE: tl.constexpr, @@ -79,15 +87,15 @@ def merge_attn_states_kernel( head_mask = head_arange < HEAD_SIZE p_out = tl.load( prefix_output - + token_idx * num_heads * HEAD_SIZE - + head_idx * HEAD_SIZE + + token_idx * num_heads * prefix_head_stride + + head_idx * prefix_head_stride + head_arange, mask=head_mask, ) s_out = tl.load( suffix_output - + token_idx * num_heads * HEAD_SIZE - + head_idx * HEAD_SIZE + + token_idx * num_heads * prefix_head_stride + + head_idx * prefix_head_stride + head_arange, mask=head_mask, ) @@ -99,7 +107,10 @@ def merge_attn_states_kernel( s_scale = s_se / out_se out = p_out * p_scale + s_out * s_scale tl.store( - output + token_idx * num_heads * HEAD_SIZE + head_idx * 
HEAD_SIZE + head_arange, + output + + token_idx * num_heads * output_head_stride + + head_idx * output_head_stride + + head_arange, out, mask=head_mask, ) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 87a3aac21d2c3..d94ed9183f639 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -1238,15 +1238,13 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]): def _v_up_proj(self, x: torch.Tensor, out: torch.Tensor): # Convert from (B, N, L) to (N, B, L) x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) + if self.is_aiter_triton_fp8_bmm_enabled: + out = out.view(-1, self.num_heads, self.v_head_dim) # Multiply + Transpose (N, B, L) x (N, L, V)->(N, B, V)->(B, N, V) x = rocm_aiter_ops.triton_fp8_bmm( - x, self.W_V, self.W_V_scale, group_size=128, transpose_bm=True + x, self.W_V, self.W_V_scale, group_size=128, transpose_bm=True, YQ=out ) - # Convert from (B, N, V) to (B, N * V) - x = x.reshape(-1, self.num_heads * self.v_head_dim) - # Copy result - out.copy_(x) else: # Convert from (B, N * V) to (N, B, V) out = out.view(-1, self.num_heads, self.v_head_dim).transpose(0, 1) @@ -1824,7 +1822,8 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): kv_c_and_k_pe_cache: torch.Tensor, attn_metadata: MLACommonMetadata, k_scale: torch.Tensor, - ) -> torch.Tensor: + output: torch.Tensor, + ) -> None: # TODO (zyongye): Prefill function here assert attn_metadata.prefill is not None assert self.dcp_world_size is not None @@ -1837,7 +1836,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1) - output = self._run_prefill_new_tokens( + output_prefill = self._run_prefill_new_tokens( prefill=attn_metadata.prefill, q=q, k=k, @@ -1846,7 +1845,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): ) if has_context: - suffix_output, suffix_lse = output + suffix_output, suffix_lse = 
output_prefill if self.dcp_world_size > 1: context_output, context_lse = ( self._context_parallel_compute_prefill_context( @@ -1862,7 +1861,12 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): q, kv_c_and_k_pe_cache, attn_metadata, k_scale ) - output = torch.empty_like(suffix_output) + # unpad if necessary + if self._pad_v: + context_output = context_output[..., : v.shape[-1]] + suffix_output = suffix_output[..., : v.shape[-1]] + + output = output.view(-1, self.num_heads, self.v_head_dim) merge_attn_states( output=output, prefix_output=context_output, @@ -1870,12 +1874,9 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): suffix_output=suffix_output, suffix_lse=suffix_lse, ) - - # unpad if necessary - if self._pad_v: - output = output[..., : v.shape[-1]] - - return output.flatten(start_dim=-2) + else: + output_prefill = output_prefill[..., : v.shape[-1]].flatten(start_dim=-2) + output.copy_(output_prefill) @abstractmethod def _forward_decode( @@ -1970,13 +1971,14 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): kv_cache = kv_cache.view(current_platform.fp8_dtype()) if has_prefill: - output[num_decode_tokens:] = self._forward_prefill( + self._forward_prefill( prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache, attn_metadata, layer._k_scale, + output=output[num_decode_tokens:], ) if has_decode: From 452a7c9f7c949cd20c3c0c81cd4352b2a0045076 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Wed, 26 Nov 2025 05:00:00 -0800 Subject: [PATCH 002/197] [Misc] Allow LM only loading for Pixtral (#29451) Signed-off-by: Roger Wang --- vllm/model_executor/models/pixtral.py | 73 +++++++++++++++++++-------- 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 6011d93a795d1..3464de472add5 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -400,21 +400,30 @@ class PixtralForConditionalGeneration(nn.Module, 
SupportsMultiModal, SupportsPP) prefix=maybe_prefix(prefix, "language_model"), ) - self.vision_encoder = VisionTransformer(self.vision_args) - - if self.vision_args.add_pre_mm_projector_layer_norm: - self.pre_mm_projector_norm = RMSNorm(self.vision_args.hidden_size, eps=1e-5) - - if self.vision_args.mm_projector_id == PATCH_MERGE: - self.patch_merger = PatchMerger( - vision_encoder_dim=self.vision_args.hidden_size, - spatial_merge_size=self.vision_args.spatial_merge_size, - use_mlp_bias=False, + if multimodal_config.get_limit_per_prompt("image"): + self.vision_encoder = VisionTransformer(self.vision_args) + self.pre_mm_projector_norm = ( + RMSNorm(self.vision_args.hidden_size, eps=1e-5) + if self.vision_args.add_pre_mm_projector_layer_norm + else None ) - - self.vision_language_adapter = VisionLanguageAdapter( - self.vision_args, dim=config.text_config.hidden_size - ) + self.patch_merger = ( + PatchMerger( + vision_encoder_dim=self.vision_args.hidden_size, + spatial_merge_size=self.vision_args.spatial_merge_size, + use_mlp_bias=False, + ) + if self.vision_args.mm_projector_id == PATCH_MERGE + else None + ) + self.vision_language_adapter = VisionLanguageAdapter( + self.vision_args, dim=config.text_config.hidden_size + ) + else: + self.vision_encoder = None + self.pre_mm_projector_norm = None + self.patch_merger = None + self.vision_language_adapter = None self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors @@ -436,13 +445,17 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP) self, image_input: PixtralImagePixelInputs, ) -> tuple[torch.Tensor, ...]: + assert ( + self.vision_encoder is not None and self.vision_language_adapter is not None + ) + images = image_input["images"] image_features = self.vision_encoder(images) feature_sizes = [image_feature.shape[0] for image_feature in image_features] image_features = torch.cat(image_features) - if self.vision_args.add_pre_mm_projector_layer_norm: + if 
self.pre_mm_projector_norm is not None: image_features = self.pre_mm_projector_norm(image_features) - if self.vision_args.mm_projector_id == PATCH_MERGE: + if self.patch_merger is not None: patch_size = self.vision_args.patch_size spatial_merge_size_square = self.vision_args.spatial_merge_size**2 img_patch_dims = [ @@ -508,41 +521,57 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP) return weight[0].startswith("pre_mm_projector_norm") # Get references to parameters for direct loading - vision_encoder_dict = dict(self.vision_encoder.named_parameters()) + vision_encoder_dict = ( + dict(self.vision_encoder.named_parameters()) + if self.vision_encoder is not None + else {} + ) patch_merger_dict = ( dict(self.patch_merger.named_parameters()) - if self.vision_args.mm_projector_id == PATCH_MERGE - else dict() + if self.patch_merger is not None + else {} ) pre_mm_projector_norm_dict = ( dict(self.pre_mm_projector_norm.named_parameters()) - if self.vision_args.add_pre_mm_projector_layer_norm - else dict() + if self.pre_mm_projector_norm is not None + else {} + ) + vision_lang_adapter_dict = ( + dict(self.vision_language_adapter.named_parameters()) + if self.vision_language_adapter is not None + else {} ) - vision_lang_adapter_dict = dict(self.vision_language_adapter.named_parameters()) def llm_weights_generator(): # Single pass over weights for name, w in weights: if is_vision_encoder_weights((name, w)): + if self.vision_encoder is None: + continue # Load vision encoder weights directly trimmed_name = ".".join(name.split(".")[1:]) param = vision_encoder_dict[trimmed_name] with torch.no_grad(): default_weight_loader(param, w) elif is_patch_merger((name, w)): + if self.patch_merger is None: + continue # Load vision patch merger weights directly trimmed_name = ".".join(name.split(".")[1:]) param = patch_merger_dict[trimmed_name] with torch.no_grad(): default_weight_loader(param, w) elif is_pre_mm_projector_norm((name, w)): + if 
self.pre_mm_projector_norm is None: + continue # Load vision pre_mm_projector_norm weights directly trimmed_name = ".".join(name.split(".")[1:]) param = pre_mm_projector_norm_dict[trimmed_name] with torch.no_grad(): default_weight_loader(param, w) elif is_vision_lang_adapter_weights((name, w)): + if self.vision_language_adapter is None: + continue # Load vision-language adapter weights directly trimmed_name = ".".join(name.split(".")[1:]) param = vision_lang_adapter_dict[trimmed_name] From e30859dff3d93bd3e289f6e996afbb59ac475b72 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 26 Nov 2025 21:00:15 +0800 Subject: [PATCH 003/197] [Bugfix] Fix handling of image embeds in models (#29480) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/deepseek_vl2.py | 15 ++------------- vllm/model_executor/models/llava_next.py | 2 +- vllm/model_executor/models/llava_onevision.py | 2 +- 3 files changed, 4 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index e7b48e0f4e554..1b6e4110039c4 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -48,7 +48,6 @@ from vllm.transformers_utils.configs.deepseek_vl2 import ( ) from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config -from vllm.utils.collection_utils import is_list_of from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.torch_utils import set_default_torch_dtype @@ -595,19 +594,9 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def _process_image_input( self, image_input: DeepseekVL2ImageInputs - ) -> list[torch.Tensor]: + ) -> torch.Tensor | list[torch.Tensor]: if image_input["type"] == "image_embeds": - image_data = image_input["data"] - if is_list_of(image_data, torch.Tensor): - # it's already a list of tensors - return 
image_data - if len(image_data.shape) == 3: - # 3D tensor - return list(torch.unbind(image_data, dim=0)) - raise ValueError( - "We expect batched 2D tensors; " - "this can be either a list of 2D tensors or a single 3D tensor." - ) + return image_input["data"] pixel_values = image_input["data"] images_spatial_crop = image_input["images_spatial_crop"] diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 98b1b46045c3d..b995cac47ac1c 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -460,7 +460,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP image_input: LlavaNextImageInputs, ) -> torch.Tensor | list[torch.Tensor]: if image_input["type"] == "image_embeds": - return [image_input["data"]] + return image_input["data"] patch_embeddings = self._process_image_pixels(image_input) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 322bde94ff66d..4e243ade68358 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -763,7 +763,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp image_input: LlavaOnevisionImageInputs, ) -> torch.Tensor | list[torch.Tensor]: if image_input["type"] == "image_embeds": - return [image_input["data"]] + return image_input["data"] patch_embeddings = self._process_image_pixels(image_input) From bb706d60482233e2feb6bab894492e394dcdef94 Mon Sep 17 00:00:00 2001 From: Yejing Lai Date: Wed, 26 Nov 2025 21:15:00 +0800 Subject: [PATCH 004/197] Fix TeleChatForCausalLM not register issue (#29473) Signed-off-by: Lai, Yejing --- tests/models/registry.py | 3 +++ vllm/model_executor/models/registry.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/tests/models/registry.py b/tests/models/registry.py index f8b3470e6d39b..c9d4823d52792 100644 --- 
a/tests/models/registry.py +++ b/tests/models/registry.py @@ -436,6 +436,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "SolarForCausalLM": _HfExamplesInfo( "upstage/solar-pro-preview-instruct", trust_remote_code=True ), + "TeleChatForCausalLM": _HfExamplesInfo( + "chuhac/TeleChat2-35B", trust_remote_code=True + ), "TeleChat2ForCausalLM": _HfExamplesInfo( "Tele-AI/TeleChat2-3B", trust_remote_code=True ), diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 53644f9cb8788..ba9f33819c950 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -170,6 +170,7 @@ _TEXT_GENERATION_MODELS = { "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"), "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"), "SolarForCausalLM": ("solar", "SolarForCausalLM"), + "TeleChatForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), "TeleFLMForCausalLM": ("teleflm", "TeleFLMForCausalLM"), "XverseForCausalLM": ("llama", "LlamaForCausalLM"), @@ -207,6 +208,7 @@ _EMBEDDING_MODELS = { "Qwen2ForProcessRewardModel": ("qwen2_rm", "Qwen2ForProcessRewardModel"), "RobertaForMaskedLM": ("roberta", "RobertaEmbeddingModel"), "RobertaModel": ("roberta", "RobertaEmbeddingModel"), + "TeleChatForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), "XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"), # [Multimodal] From 3650a74ed8fb27d4d53199969f265e426c22891b Mon Sep 17 00:00:00 2001 From: yxt Date: Wed, 26 Nov 2025 21:16:12 +0800 Subject: [PATCH 005/197] =?UTF-8?q?Optimize=20the=20wording=20of=20the=20d?= =?UTF-8?q?ocument=20and=20unify=20the=20terminology=20and=20th=E2=80=A6?= =?UTF-8?q?=20(#29491)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/models/pooling_models.md | 46 +++++++++++++++++------------------ 1 file changed, 23 
insertions(+), 23 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 18bb645ea9a9c..aca865f4bf77d 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -1,15 +1,15 @@ # Pooling Models -vLLM also supports pooling models, such as embedding, classification and reward models. +vLLM also supports pooling models, such as embedding, classification, and reward models. In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface. These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract the final hidden states of the input before returning them. !!! note - We currently support pooling models primarily as a matter of convenience. This is not guaranteed to have any performance improvement over using HF Transformers / Sentence Transformers directly. + We currently support pooling models primarily for convenience. This is not guaranteed to provide any performance improvements over using Hugging Face Transformers or Sentence Transformers directly. - We are now planning to optimize pooling models in vLLM. Please comment on if you have any suggestions! + We plan to optimize pooling models in vLLM. Please comment on if you have any suggestions! ## Configuration @@ -19,7 +19,7 @@ Run a model in pooling mode via the option `--runner pooling`. !!! tip There is no need to set this option in the vast majority of cases as vLLM can automatically - detect the model runner to use via `--runner auto`. + detect the appropriate model runner via `--runner auto`. ### Model Conversion @@ -78,7 +78,7 @@ When loading [Sentence Transformers](https://huggingface.co/sentence-transformer its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults. You can further customize this via the `--pooler-config` option, -which takes priority over both the model's and Sentence Transformers's defaults. 
+which takes priority over both the model's and Sentence Transformers' defaults. ## Offline Inference @@ -168,11 +168,11 @@ The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. - For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`. - For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`. - - For similarity scores, use `LLM.score(...)`. + - For similarity scores, use `LLM.score(...)`. - For rewards, use `LLM.reward(...)` or `pooling_task="token_classify"`. - For token classification, use `pooling_task="token_classify"`. - - For multi-vector retrieval, use `pooling_task="token_embed"` - - For IO Processor Plugins , use `pooling_task="plugin"` + - For multi-vector retrieval, use `pooling_task="token_embed"`. + - For IO Processor Plugins, use `pooling_task="plugin"`. ```python from vllm import LLM @@ -194,15 +194,15 @@ Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides - [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models. !!! note - Please use one of the more specific methods or set the task directly when using [Pooling API](../serving/openai_compatible_server.md#pooling-api) api.: + Please use one of the more specific endpoints or set the task directly when using the [Pooling API](../serving/openai_compatible_server.md#pooling-api): - For embeddings, use [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) or `"task":"embed"`. - - For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `task":"classify"`. - - For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api). - - For rewards, `task":"token_classify"`. - - For token classification, use `task":"token_classify"`. 
- - For multi-vector retrieval, use `task":"token_embed"` - - For IO Processor Plugins , use `task":"plugin"` + - For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `"task":"classify"`. + - For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api). + - For rewards, use `"task":"token_classify"`. + - For token classification, use `"task":"token_classify"`. + - For multi-vector retrieval, use `"task":"token_embed"`. + - For IO Processor Plugins, use `"task":"plugin"`. ```python # start a supported embeddings model server with `vllm serve`, e.g. @@ -232,7 +232,7 @@ for output in response.json()["data"]: ## Matryoshka Embeddings -[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows user to trade off between performance and cost. +[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows users to trade off between performance and cost. !!! warning Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings. @@ -245,9 +245,9 @@ for output in response.json()["data"]: ### Manually enable Matryoshka Embeddings -There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json,` it is allowed to change the output to arbitrary dimensions. 
Using `matryoshka_dimensions` can control the allowed output dimensions. +There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json`, you can change the output dimension to arbitrary values. Use `matryoshka_dimensions` to control the allowed output dimensions. -For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": []}` (offline) or `--hf-overrides '{"is_matryoshka": true}'`, `--hf-overrides '{"matryoshka_dimensions": []}'`(online). +For models that support Matryoshka Embeddings but are not recognized by vLLM, manually override the config using `hf_overrides={"is_matryoshka": True}` or `hf_overrides={"matryoshka_dimensions": []}` (offline), or `--hf-overrides '{"is_matryoshka": true}'` or `--hf-overrides '{"matryoshka_dimensions": []}'` (online). Here is an example to serve a model with Matryoshka Embeddings enabled. @@ -278,7 +278,7 @@ A code example can be found here: [examples/offline_inference/pooling/embed_matr ### Online Inference -Use the following command to start vllm server. +Use the following command to start the vLLM server. ```bash vllm serve jinaai/jina-embeddings-v3 --trust-remote-code @@ -310,11 +310,11 @@ An OpenAI client example can be found here: [examples/online_serving/pooling/ope ### Encode task -We have split the `encode` task into two more specific token wise tasks: `token_embed` and `token_classify`: +We have split the `encode` task into two more specific token-wise tasks: `token_embed` and `token_classify`: -- `token_embed` is the same as embed, using normalize as activation. -- `token_classify` is the same as classify, default using softmax as activation. +- `token_embed` is the same as `embed`, using normalization as the activation. 
+- `token_classify` is the same as `classify`, by default using softmax as the activation. ### Remove softmax from PoolingParams -We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function. +We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function. From 70d5953f820ec528e2b6050a7969130009410d1e Mon Sep 17 00:00:00 2001 From: Huamin Li <3ericli@gmail.com> Date: Wed, 26 Nov 2025 06:27:26 -0800 Subject: [PATCH 006/197] Revert "[Bugfix] Fix GPT-OSS AR+NORM fusion (#28841)" (#29483) Signed-off-by: Huamin Li <3ericli@gmail.com> --- .buildkite/test-pipeline.yaml | 1 - tests/compile/distributed/test_fusions_e2e.py | 11 ----------- .../device_communicators/symm_mem.py | 2 +- vllm/model_executor/layers/fused_moe/layer.py | 17 ++++++----------- 4 files changed, 7 insertions(+), 24 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 10a19c52c72dc..d14b524b793a5 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -972,7 +972,6 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - - vllm/model_executor/layers/fused_moe/layer.py - tests/compile/test_fusion_attn.py - tests/compile/test_silu_mul_quant_fusion.py - tests/compile/distributed/test_fusion_all_reduce.py diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 53c3f875d2003..661172e1965b5 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -111,17 +111,6 @@ if current_platform.is_cuda(): async_tp=96, # MLP is MoE, half the fusions of dense ), ), - ModelBackendTestCase( - 
model_name="openai/gpt-oss-20b", - model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), - backend=AttentionBackendEnum.FLASHINFER, - matches=Matches( - attention_fusion=0, - allreduce_fusion=49, - sequence_parallel=49, - async_tp=48, - ), - ), ] elif current_platform.is_rocm(): diff --git a/vllm/distributed/device_communicators/symm_mem.py b/vllm/distributed/device_communicators/symm_mem.py index 7a049b003cf73..eb1f173b11925 100644 --- a/vllm/distributed/device_communicators/symm_mem.py +++ b/vllm/distributed/device_communicators/symm_mem.py @@ -131,7 +131,7 @@ class SymmMemCommunicator: return None if out is None: out = torch.empty_like(inp) - self.buffer[: inp.numel()].copy_(inp.reshape(-1)) + self.buffer[: inp.numel()].copy_(inp.view(-1)) # Determine which algorithm to use use_multimem = False diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index bb30f1292a5fa..0ef3130b26333 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1690,10 +1690,6 @@ class FusedMoE(CustomOp): ) def reduce_output(states: torch.Tensor) -> torch.Tensor: - # Slice before all_reduce to enable possible fusion - if self.hidden_size != og_hidden_states: - states = states[..., :og_hidden_states] - if ( not self.is_sequence_parallel and not self.use_dp_chunking @@ -1716,12 +1712,11 @@ class FusedMoE(CustomOp): if self.zero_expert_num is not None and self.zero_expert_num > 0: assert isinstance(fused_output, tuple) fused_output, zero_expert_result = fused_output - return ( - reduce_output(fused_output) - + zero_expert_result[..., :og_hidden_states] - ) + return (reduce_output(fused_output) + zero_expert_result)[ + ..., :og_hidden_states + ] else: - return reduce_output(fused_output) + return reduce_output(fused_output)[..., :og_hidden_states] else: if current_platform.is_tpu(): # TODO: Once the OOM issue for the TPU backend is resolved, we @@ -1734,8 +1729,8 @@ class 
FusedMoE(CustomOp): hidden_states, router_logits, self.layer_name ) return ( - reduce_output(shared_output), - reduce_output(fused_output), + reduce_output(shared_output)[..., :og_hidden_states], + reduce_output(fused_output)[..., :og_hidden_states], ) def forward_cuda( From 0b0aa874e85431a0f08a0d1fad95ae673e034392 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 26 Nov 2025 11:38:52 -0500 Subject: [PATCH 007/197] [Perf] Optimize batch invariant BMM, 18.1% Throughput improvement, 10.7% TTFT improvement (#29345) Signed-off-by: yewentao256 Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/v1/determinism/test_batch_invariance.py | 3 - tests/v1/determinism/utils.py | 5 +- vllm/model_executor/layers/batch_invariant.py | 225 +++++++++++++++++- 3 files changed, 217 insertions(+), 16 deletions(-) diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py index b9e2daafb8705..4311547baccf1 100644 --- a/tests/v1/determinism/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -159,7 +159,6 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( "backend", BACKENDS, ) -@pytest.mark.forked def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( backend, monkeypatch: pytest.MonkeyPatch ): @@ -429,7 +428,6 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): "backend", BACKENDS, ) -@pytest.mark.forked def test_logprobs_without_batch_invariance_should_fail( backend, monkeypatch: pytest.MonkeyPatch ): @@ -646,7 +644,6 @@ def test_logprobs_without_batch_invariance_should_fail( @skip_unsupported @pytest.mark.parametrize("backend", ["FLASH_ATTN"]) -@pytest.mark.forked def test_decode_logprobs_match_prefill_logprobs( backend, monkeypatch: pytest.MonkeyPatch ): diff --git 
a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py index ecbb6a1126933..0d7da107728b4 100644 --- a/tests/v1/determinism/utils.py +++ b/tests/v1/determinism/utils.py @@ -8,6 +8,7 @@ import torch from vllm.attention.utils.fa_utils import flash_attn_supports_mla from vllm.platforms import current_platform +from vllm.utils.flashinfer import has_flashinfer skip_unsupported = pytest.mark.skipif( not (current_platform.is_cuda() and current_platform.has_device_capability(90)), @@ -16,9 +17,11 @@ skip_unsupported = pytest.mark.skipif( BACKENDS: list[str] = [ "FLASH_ATTN", - "FLASHINFER", ] +if has_flashinfer(): + BACKENDS.append("FLASHINFER") + if flash_attn_supports_mla(): BACKENDS.append("FLASH_ATTN_MLA") diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index be7f673e5618f..4154122636dcf 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -215,6 +215,139 @@ def matmul_persistent( return c +@triton.jit +def bmm_kernel( + a_ptr, # (*, ) pointer to A, (B, M, K) + b_ptr, # (*, ) pointer to B, (B, K, N) + c_ptr, # (*, ) pointer to C, (B, M, N) + B, # int, batch size + M, # int, output rows + N, # int, output cols + K, # int, reduction dim + stride_ab, + stride_am, + stride_ak, + stride_bb, + stride_bk, + stride_bn, + stride_cb, + stride_cm, + stride_cn, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + A_LARGE: tl.constexpr, + B_LARGE: tl.constexpr, + C_LARGE: tl.constexpr, +): + """Batched GEMM: (B, M, K) x (B, K, N) -> (B, M, N) + + Each program computes one (batch_idx, tile_m, tile_n) tile, accumulating + along K in a fixed order to preserve batch invariance. 
+ """ + pid_b = tl.program_id(0) + pid = tl.program_id(1) + + if pid_b >= B: + return + + # number of tiles along M / N + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + + if pid_m >= num_pid_m or pid_n >= num_pid_n: + return + + # offs_m / offs_n: raw global row/col indices for this tile + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + # masks for valid logical rows/cols within (M, N) + mask_m = offs_m < M # [BLOCK_SIZE_M] + mask_n = offs_n < N # [BLOCK_SIZE_N] + + if A_LARGE or B_LARGE or C_LARGE: + offs_m = offs_m.to(tl.int64) + offs_n = offs_n.to(tl.int64) + + offs_m = tl.where(mask_m, offs_m, 0) + offs_n = tl.where(mask_n, offs_n, 0) + + # hint for triton contiguous memory + offs_m = tl.max_contiguous(tl.multiple_of(offs_m, BLOCK_SIZE_M), BLOCK_SIZE_M) + offs_n = tl.max_contiguous(tl.multiple_of(offs_n, BLOCK_SIZE_N), BLOCK_SIZE_N) + + # base pointers for current batch, shape-wise: + # a_batch_ptr points to A[pid_b, 0, 0] + # b_batch_ptr points to B[pid_b, 0, 0] + # c_batch_ptr points to C[pid_b, 0, 0] + a_batch_ptr = a_ptr + pid_b * stride_ab + b_batch_ptr = b_ptr + pid_b * stride_bb + c_batch_ptr = c_ptr + pid_b * stride_cb + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + # number of K-blocks this tile iterates over + k_tiles = tl.cdiv(K, BLOCK_SIZE_K) + offs_k_mask = tl.arange(0, BLOCK_SIZE_K) + + for ki in range(k_tiles): + if A_LARGE or B_LARGE: + # offs_k: [BLOCK_SIZE_K], global K indices + offs_k = ki * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K).to(tl.int64) + else: + offs_k = ki * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) + + # a_ptrs: [BLOCK_SIZE_M, BLOCK_SIZE_K] + # element (i, j) points to A[pid_b, offs_m[i], offs_k[j]] + a_ptrs = a_batch_ptr + ( + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak + ) + # b_ptrs: [BLOCK_SIZE_K, BLOCK_SIZE_N] + # element (i, 
j) points to B[pid_b, offs_k[i], offs_n[j]] + b_ptrs = b_batch_ptr + ( + offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn + ) + + # valid K lanes for this block + k_valid = offs_k_mask < (K - ki * BLOCK_SIZE_K) + # A mask within (M, K): [BLOCK_SIZE_M, BLOCK_SIZE_K] + a_mask = mask_m[:, None] & k_valid[None, :] + # B mask within (K, N): [BLOCK_SIZE_K, BLOCK_SIZE_N] + b_mask = k_valid[:, None] & mask_n[None, :] + + # a: [BLOCK_SIZE_M, BLOCK_SIZE_K] from A[offs_m, offs_k] + a = tl.load( + a_ptrs, + mask=a_mask, + other=0.0, + ) + # b: [BLOCK_SIZE_K, BLOCK_SIZE_N] from B[offs_k, offs_n] + b = tl.load( + b_ptrs, + mask=b_mask, + other=0.0, + ) + accumulator = tl.dot(a, b, accumulator) + + # c_m / c_n: [BLOCK_SIZE_M] / [BLOCK_SIZE_N], row/col indices for C + c_m = offs_m + c_n = offs_n + if C_LARGE: + c_m = c_m.to(tl.int64) + c_n = c_n.to(tl.int64) + + # c_ptrs: [BLOCK_SIZE_M, BLOCK_SIZE_N] + # element (i, j) points to C[pid_b, c_m[i], c_n[j]] + c_ptrs = c_batch_ptr + stride_cm * c_m[:, None] + stride_cn * c_n[None, :] + # mask out elements that fall outside logical (M, N) range + c_mask = mask_m[:, None] & mask_n[None, :] + # cast FP32 accumulator back to original dtype of C + c = accumulator.to(c_ptr.dtype.element_ty) + tl.store(c_ptrs, c, mask=c_mask) + + @triton.jit def _log_softmax_kernel( input_ptr, @@ -526,23 +659,91 @@ def matmul_batch_invariant(a, b, *, out=None): def bmm_batch_invariant(a, b, *, out=None): # Batched matrix multiply: (B, M, K) x (B, K, N) -> (B, M, N) - # Process each batch separately with our persistent kernel - if a.ndim == 3 and b.ndim == 3: - results = [] - for i in range(a.shape[0]): - results.append(matmul_persistent(a[i], b[i])) - result = torch.stack(results, dim=0) - - if out is not None: - out.copy_(result) - return out - return result - else: + if not (a.ndim == 3 and b.ndim == 3): raise ValueError( f"bmm_batch_invariant expects 3D tensors, " f"got shapes {a.shape} and {b.shape}" ) + if a.shape[0] != b.shape[0]: + raise 
ValueError( + f"Batch dimensions of tensors must match, " + f"but got {a.shape[0]} and {b.shape[0]}." + ) + if a.shape[2] != b.shape[1]: + raise ValueError( + f"Incompatible inner dimensions for matmul: got {a.shape} and {b.shape}." + ) + if a.dtype != b.dtype: + raise ValueError(f"Incompatible dtypes: got {a.dtype} and {b.dtype}.") + + B, M, K = a.shape + _, _, N = b.shape + dtype = a.dtype + + if out is None: + c = torch.empty((B, M, N), device=a.device, dtype=dtype) + else: + assert out.shape == (B, M, N), "out tensor has incorrect shape" + assert out.dtype == dtype and out.device == a.device, "out tensor mismatch" + c = out + + configs = { + torch.bfloat16: { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "num_stages": 3, + "num_warps": 8, + }, + torch.float16: { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "num_stages": 3, + "num_warps": 8, + }, + torch.float32: { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "num_stages": 3, + "num_warps": 8, + }, + } + + cfg = configs[dtype] + # grid = (B, num_tiles_per_matrix) + grid = ( + B, + triton.cdiv(M, cfg["BLOCK_SIZE_M"]) * triton.cdiv(N, cfg["BLOCK_SIZE_N"]), + ) + + bmm_kernel[grid]( + a, + b, + c, + B, + M, + N, + K, + a.stride(0), + a.stride(1), + a.stride(2), + b.stride(0), + b.stride(1), + b.stride(2), + c.stride(0), + c.stride(1), + c.stride(2), + A_LARGE=a.numel() > 2**31, + B_LARGE=b.numel() > 2**31, + C_LARGE=c.numel() > 2**31, + **cfg, + ) + + return c + def addmm_batch_invariant(bias, a, b): return matmul_persistent(a, b, bias=bias) From e603129505fdf39b0784fe9600feb9101ed5170d Mon Sep 17 00:00:00 2001 From: HDCharles <39544797+HDCharles@users.noreply.github.com> Date: Wed, 26 Nov 2025 12:21:58 -0500 Subject: [PATCH 008/197] [refactor] CTConfig methods to static/class methods (#28870) Signed-off-by: HDCharles Co-authored-by: Isotr0py --- .../compressed_tensors/compressed_tensors.py | 55 +++++++++++-------- 1 file changed, 32 
insertions(+), 23 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 6c7d4cd7bd9ab..2800f90ce0b67 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -266,8 +266,9 @@ class CompressedTensorsConfig(QuantizationConfig): def get_config_filenames(cls) -> list[str]: return [] + @staticmethod def _check_scheme_supported( - self, min_capability: int, error: bool = True, match_exact: bool = False + min_capability: int, error: bool = True, match_exact: bool = False ) -> bool: capability_tuple = current_platform.get_device_capability() @@ -293,9 +294,8 @@ class CompressedTensorsConfig(QuantizationConfig): else: return False - def _is_fp4a4_nvfp4( - self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs - ): + @staticmethod + def _is_fp4a4_nvfp4(weight_quant: QuantizationArgs, input_quant: QuantizationArgs): if weight_quant is None or input_quant is None: return False @@ -322,9 +322,8 @@ class CompressedTensorsConfig(QuantizationConfig): and is_symmetric ) - def _is_fp4a16_nvfp4( - self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs - ): + @staticmethod + def _is_fp4a16_nvfp4(weight_quant: QuantizationArgs, input_quant: QuantizationArgs): is_weight_only = weight_quant is not None and input_quant is None is_tensor_group_quant = ( weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value @@ -344,8 +343,9 @@ class CompressedTensorsConfig(QuantizationConfig): and is_symmetric ) + @staticmethod def _is_static_tensor_w8a8( - self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs + weight_quant: QuantizationArgs, input_quant: QuantizationArgs ) -> bool: is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 weight_strategy = ( @@ -362,8 +362,9 @@ class 
CompressedTensorsConfig(QuantizationConfig): # Only symmetric weight quantization supported. return is_8_bits and is_tensor and weight_quant.symmetric and is_static + @staticmethod def _is_dynamic_token_w8a8( - self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs + weight_quant: QuantizationArgs, input_quant: QuantizationArgs ) -> bool: is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 weight_strategy = ( @@ -379,8 +380,9 @@ class CompressedTensorsConfig(QuantizationConfig): # Only symmetric weight quantization supported. return is_8_bits and is_token and weight_quant.symmetric and is_dynamic + @staticmethod def _is_dynamic_token_w4a8_int( - self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs + weight_quant: QuantizationArgs, input_quant: QuantizationArgs ) -> bool: is_weight_4_bits = weight_quant.num_bits == 4 is_activation_8_bits = input_quant.num_bits == 8 @@ -403,8 +405,9 @@ class CompressedTensorsConfig(QuantizationConfig): and is_dynamic ) + @staticmethod def _is_fp8_w8a8( - self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs + weight_quant: QuantizationArgs, input_quant: QuantizationArgs ) -> bool: # Confirm weights and activations quantized. 
if weight_quant is None or input_quant is None: @@ -439,8 +442,9 @@ class CompressedTensorsConfig(QuantizationConfig): is_per_tensor_activation = input_quant.strategy == QuantizationStrategy.TENSOR return is_symmetric_activation and is_per_tensor_activation + @staticmethod def _is_fp8_w4a8( - self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs + weight_quant: QuantizationArgs, input_quant: QuantizationArgs ) -> bool: if not weight_quant or not input_quant: return False @@ -462,29 +466,33 @@ class CompressedTensorsConfig(QuantizationConfig): and is_dynamic ) + @classmethod def _is_fp8_w4a8_sm90( - self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs + cls, weight_quant: QuantizationArgs, input_quant: QuantizationArgs ) -> bool: - return self._check_scheme_supported( + return cls._check_scheme_supported( 90, error=False, match_exact=True - ) and self._is_fp8_w4a8(weight_quant, input_quant) + ) and cls._is_fp8_w4a8(weight_quant, input_quant) + @classmethod def _is_fp8_w8a8_sm90( - self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs + cls, weight_quant: QuantizationArgs, input_quant: QuantizationArgs ) -> bool: - return self._check_scheme_supported( + return cls._check_scheme_supported( 90, error=False, match_exact=True - ) and self._is_fp8_w8a8(weight_quant, input_quant) + ) and cls._is_fp8_w8a8(weight_quant, input_quant) + @classmethod def _is_fp8_w8a8_sm100( - self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs + cls, weight_quant: QuantizationArgs, input_quant: QuantizationArgs ) -> bool: - return self._check_scheme_supported( + return cls._check_scheme_supported( 100, error=False, match_exact=True - ) and self._is_fp8_w8a8(weight_quant, input_quant) + ) and cls._is_fp8_w8a8(weight_quant, input_quant) + @staticmethod def _is_fp8_w8a16( - self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs + weight_quant: QuantizationArgs, input_quant: QuantizationArgs ) -> bool: # Confirm weights 
quantized. if weight_quant is None: @@ -508,8 +516,9 @@ class CompressedTensorsConfig(QuantizationConfig): and is_tensor_or_channel_or_block_weight ) + @staticmethod def _is_wNa16_group_channel( - self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs + weight_quant: QuantizationArgs, input_quant: QuantizationArgs ) -> bool: input_quant_none = input_quant is None is_channel_group = ( From c4c0354eec86f3486285f121fa184dd6d9cacb9d Mon Sep 17 00:00:00 2001 From: Alec <35311602+alec-flowers@users.noreply.github.com> Date: Wed, 26 Nov 2025 09:41:16 -0800 Subject: [PATCH 009/197] [CI/Build] allow user modify pplx and deepep ref by ENV or command line (#29131) Signed-off-by: alec-flowers --- docker/Dockerfile | 8 ++- tools/ep_kernels/install_python_libraries.sh | 66 +++++++++++++++++--- 2 files changed, 63 insertions(+), 11 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 84a1802dbe03a..aa3aad21d6c07 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -244,9 +244,15 @@ RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh # Install EP kernels(pplx-kernels and DeepEP) +ARG PPLX_COMMIT_HASH +ARG DEEPEP_COMMIT_HASH RUN --mount=type=cache,target=/root/.cache/uv \ export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \ - /tmp/install_python_libraries.sh /tmp/ep_kernels_workspace wheel && \ + /tmp/install_python_libraries.sh \ + --workspace /tmp/ep_kernels_workspace \ + --mode wheel \ + ${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \ + ${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \ find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete # Check the size of the wheel if RUN_WHEEL_CHECK is true diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh index 1cea1bef8dbc9..88be5cd778fff 100755 --- a/tools/ep_kernels/install_python_libraries.sh +++ 
b/tools/ep_kernels/install_python_libraries.sh @@ -1,22 +1,68 @@ #!/usr/bin/env bash set -ex -# usage: ./build.sh [workspace_dir] [mode] -# mode: "install" (default) → install directly into current Python env -# "wheel" → build wheels into WORKSPACE/dist +# usage: ./install_python_libraries.sh [options] +# --workspace workspace directory (default: ./ep_kernels_workspace) +# --mode "install" (default) or "wheel" +# --pplx-ref pplx-kernels commit hash +# --deepep-ref DeepEP commit hash + +CUDA_HOME=${CUDA_HOME:-/usr/local/cuda} +PPLX_COMMIT_HASH=${PPLX_COMMIT_HASH:-"12cecfd"} +DEEPEP_COMMIT_HASH=${DEEPEP_COMMIT_HASH:-"73b6ea4"} +NVSHMEM_VER=3.3.9 +WORKSPACE=${WORKSPACE:-$(pwd)/ep_kernels_workspace} +MODE=${MODE:-install} + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --workspace) + if [[ -z "$2" || "$2" =~ ^- ]]; then + echo "Error: --workspace requires an argument." >&2 + exit 1 + fi + WORKSPACE="$2" + shift 2 + ;; + --mode) + if [[ -z "$2" || "$2" =~ ^- ]]; then + echo "Error: --mode requires an argument." >&2 + exit 1 + fi + MODE="$2" + shift 2 + ;; + --pplx-ref) + if [[ -z "$2" || "$2" =~ ^- ]]; then + echo "Error: --pplx-ref requires an argument." >&2 + exit 1 + fi + PPLX_COMMIT_HASH="$2" + shift 2 + ;; + --deepep-ref) + if [[ -z "$2" || "$2" =~ ^- ]]; then + echo "Error: --deepep-ref requires an argument." 
>&2 + exit 1 + fi + DEEPEP_COMMIT_HASH="$2" + shift 2 + ;; + *) + echo "Error: Unknown argument '$1'" >&2 + exit 1 + ;; + esac +done -WORKSPACE=${1:-$(pwd)/ep_kernels_workspace} -MODE=${2:-install} mkdir -p "$WORKSPACE" WHEEL_DIR="$WORKSPACE/dist" mkdir -p "$WHEEL_DIR" -NVSHMEM_VER=3.3.9 pushd "$WORKSPACE" -CUDA_HOME=${CUDA_HOME:-/usr/local/cuda} - # install dependencies if not installed if [ -z "$VIRTUAL_ENV" ]; then uv pip install --system cmake torch ninja @@ -133,7 +179,7 @@ do_build \ "https://github.com/ppl-ai/pplx-kernels" \ "pplx-kernels" \ "setup.py" \ - "12cecfd" \ + "$PPLX_COMMIT_HASH" \ "" # build DeepEP @@ -141,7 +187,7 @@ do_build \ "https://github.com/deepseek-ai/DeepEP" \ "DeepEP" \ "setup.py" \ - "73b6ea4" \ + "$DEEPEP_COMMIT_HASH" \ "export NVSHMEM_DIR=$WORKSPACE/nvshmem; " if [ "$MODE" = "wheel" ]; then From 430dd4d9eb7e342e28012351df06d93892f86741 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 26 Nov 2025 12:53:15 -0500 Subject: [PATCH 010/197] [Attention] Remove imports from `vllm/attention/__init__.py` (#29342) Signed-off-by: Matthew Bonanni --- docs/contributing/model/basic.md | 2 +- tests/compile/test_fusion_attn.py | 3 ++- tests/compile/test_qk_norm_rope_fusion.py | 3 ++- tests/kernels/utils.py | 2 +- tests/v1/worker/test_gpu_model_runner.py | 2 +- tests/v1/worker/test_utils.py | 4 ++-- vllm/attention/__init__.py | 19 ------------------- vllm/attention/backends/abstract.py | 2 +- vllm/attention/layer.py | 7 +++++-- vllm/compilation/fusion_attn.py | 2 +- vllm/compilation/qk_norm_rope_fusion.py | 2 +- .../kv_connector/v1/nixl_connector.py | 2 +- .../kv_connector/v1/offloading_connector.py | 3 ++- .../layers/mamba/linear_attn.py | 2 +- vllm/model_executor/model_loader/utils.py | 3 +-- vllm/model_executor/models/afmoe.py | 3 ++- vllm/model_executor/models/apertus.py | 3 ++- vllm/model_executor/models/arctic.py | 2 +- vllm/model_executor/models/baichuan.py | 2 +- vllm/model_executor/models/bailing_moe.py | 2 +- 
vllm/model_executor/models/bloom.py | 2 +- vllm/model_executor/models/chameleon.py | 2 +- vllm/model_executor/models/chatglm.py | 2 +- vllm/model_executor/models/clip.py | 3 +-- vllm/model_executor/models/commandr.py | 2 +- vllm/model_executor/models/dbrx.py | 2 +- vllm/model_executor/models/deepseek_v2.py | 2 +- vllm/model_executor/models/dots1.py | 2 +- vllm/model_executor/models/ernie45_moe.py | 2 +- vllm/model_executor/models/ernie45_vl_moe.py | 2 +- vllm/model_executor/models/exaone.py | 2 +- vllm/model_executor/models/exaone4.py | 2 +- vllm/model_executor/models/falcon.py | 2 +- vllm/model_executor/models/gemma.py | 2 +- vllm/model_executor/models/gemma2.py | 2 +- vllm/model_executor/models/gemma3.py | 3 ++- vllm/model_executor/models/gemma3n.py | 2 +- vllm/model_executor/models/glm4.py | 3 ++- vllm/model_executor/models/glm4_moe.py | 2 +- vllm/model_executor/models/gpt2.py | 2 +- vllm/model_executor/models/gpt_bigcode.py | 2 +- vllm/model_executor/models/gpt_j.py | 2 +- vllm/model_executor/models/gpt_neox.py | 2 +- vllm/model_executor/models/gpt_oss.py | 3 ++- vllm/model_executor/models/granite.py | 2 +- vllm/model_executor/models/granitemoe.py | 2 +- vllm/model_executor/models/grok1.py | 2 +- vllm/model_executor/models/hunyuan_v1.py | 3 ++- vllm/model_executor/models/internlm2.py | 2 +- vllm/model_executor/models/jais.py | 2 +- vllm/model_executor/models/lfm2.py | 2 +- vllm/model_executor/models/lfm2_moe.py | 2 +- vllm/model_executor/models/llama.py | 3 ++- vllm/model_executor/models/llama4.py | 2 +- vllm/model_executor/models/minicpm.py | 2 +- vllm/model_executor/models/minicpm3.py | 2 +- vllm/model_executor/models/minimax_m2.py | 2 +- vllm/model_executor/models/minimax_text_01.py | 3 ++- vllm/model_executor/models/mixtral.py | 2 +- vllm/model_executor/models/molmo.py | 3 +-- vllm/model_executor/models/mpt.py | 2 +- vllm/model_executor/models/nemotron.py | 2 +- vllm/model_executor/models/nemotron_nas.py | 2 +- vllm/model_executor/models/olmo.py | 2 +- 
vllm/model_executor/models/olmo2.py | 2 +- vllm/model_executor/models/olmoe.py | 2 +- vllm/model_executor/models/openpangu.py | 3 ++- vllm/model_executor/models/opt.py | 2 +- vllm/model_executor/models/orion.py | 2 +- vllm/model_executor/models/ouro.py | 3 ++- vllm/model_executor/models/persimmon.py | 2 +- vllm/model_executor/models/phi.py | 2 +- vllm/model_executor/models/phimoe.py | 2 +- vllm/model_executor/models/qwen.py | 2 +- vllm/model_executor/models/qwen2.py | 3 ++- vllm/model_executor/models/qwen2_moe.py | 2 +- vllm/model_executor/models/qwen3.py | 3 ++- vllm/model_executor/models/qwen3_moe.py | 2 +- vllm/model_executor/models/qwen3_next.py | 3 ++- vllm/model_executor/models/seed_oss.py | 3 ++- vllm/model_executor/models/solar.py | 2 +- vllm/model_executor/models/stablelm.py | 2 +- vllm/model_executor/models/starcoder2.py | 2 +- vllm/model_executor/models/step3_text.py | 2 +- .../models/transformers/base.py | 3 ++- vllm/model_executor/models/whisper.py | 4 ++-- vllm/platforms/cuda.py | 2 +- vllm/v1/attention/backends/cpu_attn.py | 2 +- vllm/v1/attention/backends/flash_attn.py | 2 +- vllm/v1/attention/backends/flex_attention.py | 2 +- vllm/v1/kv_offload/cpu.py | 2 +- vllm/v1/kv_offload/spec.py | 2 +- vllm/v1/kv_offload/worker/cpu_gpu.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 3 ++- .../worker/kv_connector_model_runner_mixin.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 3 +-- 96 files changed, 120 insertions(+), 121 deletions(-) diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index e828de0adf3c2..a68d1f0162a10 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -29,7 +29,7 @@ The initialization code should look like this: ```python from torch import nn from vllm.config import VllmConfig - from vllm.attention import Attention + from vllm.attention.layer import Attention class MyAttention(nn.Module): def __init__(self, vllm_config: VllmConfig, prefix: str): diff --git 
a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index ea61c94953a77..dbe12dc5de705 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -9,8 +9,9 @@ from tests.compile.backend import LazyInitPass, TestBackend from tests.utils import flat_product from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant -from vllm.attention import Attention, AttentionMetadata +from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.attention.layer import Attention from vllm.attention.selector import global_force_attn_backend_context_manager from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass from vllm.compilation.fx_utils import find_op_nodes diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py index 511e50f5fdc24..5ebb95b6db332 100644 --- a/tests/compile/test_qk_norm_rope_fusion.py +++ b/tests/compile/test_qk_norm_rope_fusion.py @@ -5,7 +5,8 @@ import pytest import torch from tests.compile.backend import TestBackend -from vllm.attention import Attention, AttentionType +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 9307ef7814a8b..b8148ce06b3fd 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -14,7 +14,7 @@ import torch from torch._prims_common import TensorLikeType from tests.kernels.quant_utils import native_w8a8_block_matmul -from vllm.attention import AttentionType +from vllm.attention.backends.abstract import AttentionType from 
vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input from vllm.utils import ( diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index d0f1b703fcb92..89669ee8b71a0 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -5,8 +5,8 @@ import numpy as np import pytest import torch -from vllm.attention import Attention from vllm.attention.backends.abstract import MultipleOf +from vllm.attention.layer import Attention from vllm.config import ( CacheConfig, ModelConfig, diff --git a/tests/v1/worker/test_utils.py b/tests/v1/worker/test_utils.py index f987b09e603e7..bcf5611e35228 100644 --- a/tests/v1/worker/test_utils.py +++ b/tests/v1/worker/test_utils.py @@ -7,7 +7,7 @@ from vllm.v1.worker.utils import bind_kv_cache def test_bind_kv_cache(): - from vllm.attention import Attention + from vllm.attention.layer import Attention ctx = { "layers.0.self_attn": Attention(32, 128, 0.1), @@ -35,7 +35,7 @@ def test_bind_kv_cache(): def test_bind_kv_cache_non_attention(): - from vllm.attention import Attention + from vllm.attention.layer import Attention # example from Jamba PP=2 ctx = { diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py index 8b4dc4013362e..e69de29bb2d1d 100644 --- a/vllm/attention/__init__.py +++ b/vllm/attention/__init__.py @@ -1,19 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from vllm.attention.backends.abstract import ( - AttentionBackend, - AttentionMetadata, - AttentionType, -) -from vllm.attention.layer import Attention -from vllm.attention.selector import get_attn_backend, get_mamba_attn_backend - -__all__ = [ - "Attention", - "AttentionBackend", - "AttentionMetadata", - "AttentionType", - "get_attn_backend", - "get_mamba_attn_backend", -] diff --git a/vllm/attention/backends/abstract.py 
b/vllm/attention/backends/abstract.py index bd7e81b15bfc3..a321167b8090f 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -178,7 +178,7 @@ class AttentionBackend(ABC): By default, only supports decoder attention. Backends should override this to support other attention types. """ - from vllm.attention import AttentionType + from vllm.attention.backends.abstract import AttentionType return attn_type == AttentionType.DECODER diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index f1d57ac50fb9f..62ac38751aa01 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -10,8 +10,11 @@ import torch.nn as nn import torch.nn.functional as F import vllm.envs as envs -from vllm.attention import AttentionType -from vllm.attention.backends.abstract import AttentionBackend, MLAAttentionImpl +from vllm.attention.backends.abstract import ( + AttentionBackend, + AttentionType, + MLAAttentionImpl, +) from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.selector import get_attn_backend from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py index 4f44faece75e5..6dcbbd85d7031 100644 --- a/vllm/compilation/fusion_attn.py +++ b/vllm/compilation/fusion_attn.py @@ -10,7 +10,7 @@ from torch import fx from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._inductor.pattern_matcher import PatternMatcherPass -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( diff --git a/vllm/compilation/qk_norm_rope_fusion.py b/vllm/compilation/qk_norm_rope_fusion.py index e3c399e079063..794cd8e3fce56 100644 --- a/vllm/compilation/qk_norm_rope_fusion.py +++ 
b/vllm/compilation/qk_norm_rope_fusion.py @@ -9,7 +9,7 @@ from torch import fx from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._inductor.pattern_matcher import PatternMatcherPass -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 493938d4aad92..ff51840b84b14 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -20,7 +20,7 @@ import torch import zmq from vllm import envs -from vllm.attention import AttentionBackend +from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.selector import get_attn_backend from vllm.config import VllmConfig diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index 8cd09014cab11..0ad9d4ae1b39f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -8,7 +8,8 @@ from typing import Any, ClassVar import torch -from vllm.attention import Attention, AttentionBackend, AttentionMetadata +from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata +from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.v1 import ( diff --git a/vllm/model_executor/layers/mamba/linear_attn.py 
b/vllm/model_executor/layers/mamba/linear_attn.py index d85b3e61c5d61..278713408c288 100644 --- a/vllm/model_executor/layers/mamba/linear_attn.py +++ b/vllm/model_executor/layers/mamba/linear_attn.py @@ -8,7 +8,7 @@ import torch.nn.functional as F from einops import rearrange from torch import nn -from vllm.attention import AttentionMetadata +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config from vllm.distributed.communication_op import tensor_model_parallel_all_reduce from vllm.distributed.parallel_state import ( diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 2021b68b8a60b..eeb2444150eef 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -11,8 +11,7 @@ import torch from torch import nn from typing_extensions import assert_never -from vllm.attention import Attention -from vllm.attention.layer import MLAAttention +from vllm.attention.layer import Attention, MLAAttention from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 4eb5665a71fc8..85827d54c911a 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -9,7 +9,8 @@ from itertools import islice import torch from torch import nn -from vllm.attention import Attention, AttentionType +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index b75e91319bbad..f38b09bf55068 
100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -32,7 +32,8 @@ import torch from torch import nn from transformers import ApertusConfig -from vllm.attention import Attention, AttentionType +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index b75a254761d4e..266d29a8d9b2b 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -8,7 +8,7 @@ from itertools import islice import torch from torch import nn -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 024788918d024..beb22995a0719 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -29,7 +29,7 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index cc10e936a2d3d..f7a5d4e7889e5 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -32,7 +32,7 @@ import torch.nn.functional as F from torch import nn from transformers.configuration_utils import PretrainedConfig -from vllm.attention import Attention 
+from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 00fba93423d8e..507fbf1fdd0a8 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -27,7 +27,7 @@ import torch from torch import nn from transformers import BloomConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index b5a6d00dc309f..3aa01bb1905fe 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -16,7 +16,7 @@ from transformers import ( ChameleonVQVAEConfig, ) -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index dbfcd62d0bcab..3d485fdd0a2e1 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -12,7 +12,7 @@ import torch from torch import nn from torch.nn import LayerNorm -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 5d611deb942d1..c2993b47dc3f9 100644 --- 
a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -14,8 +14,7 @@ from transformers import ( CLIPVisionConfig, ) -from vllm.attention import Attention -from vllm.attention.layer import MultiHeadAttention +from vllm.attention.layer import Attention, MultiHeadAttention from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 5ed920927c772..f837502c468f1 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -30,7 +30,7 @@ import torch from torch import nn from transformers import Cohere2Config, CohereConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 2c729019081a4..946baffc8817a 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -8,7 +8,7 @@ import torch import torch.nn as nn from transformers import DbrxConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( get_pp_group, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index ad932559b983d..73cac2556c55a 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -33,8 +33,8 @@ from torch import nn from transformers import DeepseekV2Config, DeepseekV3Config from vllm._aiter_ops import rocm_aiter_ops -from vllm.attention import Attention from vllm.attention.backends.abstract import 
AttentionBackend +from vllm.attention.layer import Attention from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ParallelConfig, VllmConfig, get_current_vllm_config diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index e65c275106a4e..1c2abbe7b3a78 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -32,7 +32,7 @@ import torch from torch import nn from transformers import Dots1Config -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index a7df3509e3ecd..278ba45e9684c 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -32,7 +32,7 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index 50e033d77606d..72f9957fc8828 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -31,7 +31,7 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention # from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig diff --git a/vllm/model_executor/models/exaone.py 
b/vllm/model_executor/models/exaone.py index d13275488fe99..99002baa87529 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -32,7 +32,7 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 70f3cce2b7c56..9d2c67d6c4f80 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -28,7 +28,7 @@ import torch from torch import nn from transformers import Exaone4Config -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index dc2d51f340c8c..32d9e7b925597 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -30,7 +30,7 @@ from torch import nn from torch.nn import LayerNorm from transformers import FalconConfig as HF_FalconConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 00c7f59a08094..dd5a74c8ed005 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -26,7 +26,7 @@ import torch from torch import nn from transformers import GemmaConfig -from 
vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 9b6cfe6932300..cb36e04824588 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -23,7 +23,7 @@ import torch from torch import nn from transformers import Gemma2Config -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 4ad6fc89dcaf2..73176eba95ed5 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -23,7 +23,8 @@ import torch.nn.functional as F from torch import nn from transformers import Gemma3TextConfig -from vllm.attention import Attention, AttentionType +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index 8f1447ba34a81..f4427c9fd1d10 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -21,7 +21,7 @@ import torch from torch import nn from transformers.models.gemma3n.configuration_gemma3n import Gemma3nTextConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import 
support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index f8ef3b0385fb1..002cdb721e1db 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -29,7 +29,8 @@ import torch from torch import nn from transformers import Glm4Config -from vllm.attention import Attention, AttentionType +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 5aa51af54a00b..c99f824e1bd4d 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -31,7 +31,7 @@ import torch from torch import nn from transformers.models.glm4_moe import Glm4MoeConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index a5e8131c7fba9..da5d48a94ff3e 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -27,7 +27,7 @@ import torch from torch import nn from transformers import GPT2Config -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed.parallel_state import ( diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 
cdf038ba25c92..a405fd184513f 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -28,7 +28,7 @@ import torch from torch import nn from transformers import GPTBigCodeConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index bd1bfea3c0fef..f0a34c47da54c 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -26,7 +26,7 @@ import torch from torch import nn from transformers import GPTJConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 815c2fba4d9fe..b9959682cbcef 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -26,7 +26,7 @@ import torch from torch import nn from transformers import GPTNeoXConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 1bc0ad38765d5..9de3e261941b1 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -7,7 +7,8 @@ import torch.distributed as dist from torch import nn from transformers import GptOssConfig -from 
vllm.attention import Attention, AttentionType +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index cd7ce2fc8f00a..eac9ef9478a6a 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -31,7 +31,7 @@ import torch from torch import nn from transformers import GraniteConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 8f4139d63c3f6..02c6c5862141f 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -31,7 +31,7 @@ from typing import Any import torch from torch import nn -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index cfca564920111..6f62a1d11e52e 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -31,7 +31,7 @@ import torch import torch.nn.functional as F from torch import nn -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git 
a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index 53fb444ed622d..ccdfa3fe175f1 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -33,7 +33,8 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention import Attention, AttentionType +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index dc8f821bd134f..c79934e121447 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -10,7 +10,7 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 5549a1fc1cd30..6012288814f15 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -28,7 +28,7 @@ from itertools import islice import torch from torch import nn -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index 74bdde27ece5c..69615f8b6a099 100644 --- a/vllm/model_executor/models/lfm2.py +++ b/vllm/model_executor/models/lfm2.py @@ -7,7 +7,7 @@ import torch import torch.nn as nn from transformers import Lfm2Config -from 
vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index c088a08211527..aaeb2cc38999e 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -6,7 +6,7 @@ from itertools import islice import torch import torch.nn as nn -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f6af2bb3b12e9..6dfbde7a17f54 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -31,7 +31,8 @@ import torch from torch import nn from transformers import LlamaConfig -from vllm.attention import Attention, AttentionType +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index e1bdfc3405f70..423be45e80149 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -24,7 +24,7 @@ import torch from torch import nn from transformers import Llama4TextConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention from vllm.compilation.decorators 
import support_torch_compile from vllm.config import CacheConfig, VllmConfig diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 04923833065f3..67911ba8c1c8f 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -33,7 +33,7 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 2d775219fc972..0a2bcbd7f6084 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -29,7 +29,7 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.layernorm import RMSNorm diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index 4955c68c0cda8..dd98e36ec0851 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -30,7 +30,7 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 50f7396e2de60..390de78cc27b4 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ 
b/vllm/model_executor/models/minimax_text_01.py @@ -14,7 +14,8 @@ import torch from torch import nn from transformers import MiniMaxConfig -from vllm.attention import Attention, AttentionMetadata +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed.parallel_state import ( diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0a9c3f136964e..e21656dbd6350 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -32,7 +32,7 @@ import torch from torch import nn from transformers import MixtralConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index dc06938d5d6e1..7b53299cccbe4 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -17,8 +17,7 @@ from transformers import BatchFeature, PretrainedConfig, ProcessorMixin, TensorT from transformers.image_utils import ImageInput from transformers.tokenization_utils_base import TextInput -from vllm.attention import Attention -from vllm.attention.layer import MultiHeadAttention +from vllm.attention.layer import Attention, MultiHeadAttention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 106ad971a321a..1e285646b9ec3 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -10,7 +10,7 @@ import 
torch import torch.nn as nn from transformers import MptConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index c3337bd1ea699..93ad2064a2fca 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -30,7 +30,7 @@ from itertools import islice import torch from torch import nn -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index 2eebe38051cbd..34ea2945b711e 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -31,7 +31,7 @@ import torch from torch import nn from transformers import LlamaConfig -from vllm.attention import AttentionType +from vllm.attention.backends.abstract import AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index bd8a8e317544f..3bbb4dd242262 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -31,7 +31,7 @@ import torch from torch import nn from transformers import OlmoConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, 
get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index f0f6b2f6b3e6d..88e9c2d8541a1 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -32,7 +32,7 @@ import torch from torch import nn from transformers import Olmo2Config -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index c39e338d72e22..1376583a99725 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -21,7 +21,7 @@ from itertools import islice import torch from torch import nn -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed import ( diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index 4124a181a14c2..bddd9fa50957a 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -29,7 +29,8 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention import Attention, AttentionType +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ParallelConfig, VllmConfig from vllm.distributed import ( diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 5df700d1a2e17..bba5291ea5ef5 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -27,7 +27,7 @@ import torch from torch import nn 
from transformers import OPTConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index b30be93ca726f..544a44ed54681 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -15,7 +15,7 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index 63d2fff6ec8bc..dcae92ed20881 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -33,7 +33,8 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention import Attention, AttentionType +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 98963d52e4848..795cd25f16753 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -30,7 +30,7 @@ import torch from torch import nn from transformers import PersimmonConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from 
vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index da476f621627b..70016d9ed246c 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -45,7 +45,7 @@ import torch from torch import nn from transformers import PhiConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 8ffac95d93960..a5a669139b2f7 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -31,7 +31,7 @@ import torch from torch import nn from transformers.configuration_utils import PretrainedConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index c973e79170982..12285cf9c1968 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -16,7 +16,7 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 5831ce0b3d64b..34c31d8deee23 100644 --- 
a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -33,7 +33,8 @@ import torch from torch import nn from transformers import Qwen2Config -from vllm.attention import Attention, AttentionType +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 6b97d0b2ca2e3..5a428740082f6 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -34,7 +34,7 @@ import torch.nn.functional as F from torch import nn from transformers import Qwen2MoeConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 93a629d81e8ff..7d2b3e5f9bc79 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -30,7 +30,8 @@ import torch from torch import nn from transformers import Qwen3Config -from vllm.attention import Attention, AttentionType +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 8ee3dd99e11db..6f520706a3176 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ 
b/vllm/model_executor/models/qwen3_moe.py @@ -31,7 +31,7 @@ from typing import Any import torch from torch import nn -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index bfed64728305e..661a182151d74 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -10,7 +10,8 @@ from einops import rearrange from torch import nn from transformers.activations import ACT2FN -from vllm.attention import Attention, AttentionMetadata +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CacheConfig, diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index 4744d8e44f390..267c60157506d 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -30,7 +30,8 @@ import torch from torch import nn from transformers import PretrainedConfig as SeedOssConfig -from vllm.attention import Attention, AttentionType +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 7e9fc51036d2e..c576154b1ecfd 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -30,7 +30,7 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention import 
Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index a738fcbb4ee28..6cb98b7b72a5b 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -29,7 +29,7 @@ import torch from torch import nn from transformers import StableLmConfig -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 1118fca3cac91..46422f303ff43 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -28,7 +28,7 @@ import torch from torch import nn from transformers import Starcoder2Config -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 3c377a2c539df..077cce84a98dd 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -9,7 +9,7 @@ from typing import Any import torch from torch import nn -from vllm.attention import Attention +from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( diff --git 
a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py index f4ba4758bcc46..b33ce35427f5e 100644 --- a/vllm/model_executor/models/transformers/base.py +++ b/vllm/model_executor/models/transformers/base.py @@ -27,7 +27,8 @@ from torch import nn from transformers import AutoModel from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS -from vllm.attention import Attention, AttentionType +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention from vllm.config.utils import getattr_iter from vllm.distributed import get_pp_group, get_tp_group diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 50587c627160d..c72b5e1c091f2 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -16,8 +16,8 @@ from transformers import ( ) from transformers.models.whisper.modeling_whisper import sinusoids -from vllm.attention import Attention, AttentionType -from vllm.attention.layer import MultiHeadAttention +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention, MultiHeadAttention from vllm.attention.layers.cross_attention import CrossAttention from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 75b6bc77e4c1c..e8e14387bb7f6 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -335,7 +335,7 @@ class CudaPlatformBase(Platform): use_sparse: bool, attn_type: str | None = None, ) -> str: - from vllm.attention import AttentionType + from vllm.attention.backends.abstract import AttentionType if attn_type is None: attn_type = AttentionType.DECODER diff --git a/vllm/v1/attention/backends/cpu_attn.py 
b/vllm/v1/attention/backends/cpu_attn.py index 590bf91b0d057..d0b1f8c1b8071 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -51,7 +51,7 @@ class CPUAttentionBackend(AttentionBackend): @classmethod def supports_attn_type(cls, attn_type: str) -> bool: """CPU attention supports decoder and encoder-only attention.""" - from vllm.attention import AttentionType + from vllm.attention.backends.abstract import AttentionType return attn_type in ( AttentionType.DECODER, diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index a9a4af5ac1183..0fc57cfb1f9d3 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -84,7 +84,7 @@ class FlashAttentionBackend(AttentionBackend): @classmethod def supports_attn_type(cls, attn_type: str) -> bool: """FlashAttention supports all attention types.""" - from vllm.attention import AttentionType + from vllm.attention.backends.abstract import AttentionType return attn_type in ( AttentionType.DECODER, diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index 7768827d26dc3..3869f1f4164c9 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -87,7 +87,7 @@ class FlexAttentionBackend(AttentionBackend): @classmethod def supports_attn_type(cls, attn_type: str) -> bool: """FlexAttention supports both decoder and encoder-only attention.""" - from vllm.attention import AttentionType + from vllm.attention.backends.abstract import AttentionType return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY) diff --git a/vllm/v1/kv_offload/cpu.py b/vllm/v1/kv_offload/cpu.py index 86747299eb107..2f2e85c0ff332 100644 --- a/vllm/v1/kv_offload/cpu.py +++ b/vllm/v1/kv_offload/cpu.py @@ -4,7 +4,7 @@ from collections.abc import Iterator import torch -from vllm.attention import AttentionBackend +from 
vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig from vllm.platforms import current_platform from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager diff --git a/vllm/v1/kv_offload/spec.py b/vllm/v1/kv_offload/spec.py index c1813a4ff4ea9..3afce55890752 100644 --- a/vllm/v1/kv_offload/spec.py +++ b/vllm/v1/kv_offload/spec.py @@ -11,7 +11,7 @@ from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager from vllm.v1.kv_offload.worker.worker import OffloadingHandler if TYPE_CHECKING: - from vllm.attention import AttentionBackend + from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig logger = init_logger(__name__) diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py index bb163f0043fc6..461458c1f6ce8 100644 --- a/vllm/v1/kv_offload/worker/cpu_gpu.py +++ b/vllm/v1/kv_offload/worker/cpu_gpu.py @@ -5,7 +5,7 @@ import numpy as np import torch from vllm import _custom_ops as ops -from vllm.attention import AttentionBackend +from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger from vllm.utils.platform_utils import is_pin_memory_available from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d3c61794f8b0d..581921a9bfe52 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -19,12 +19,13 @@ import torch.nn as nn from tqdm import tqdm import vllm.envs as envs -from vllm.attention import Attention, AttentionType from vllm.attention.backends.abstract import ( AttentionBackend, AttentionMetadata, + AttentionType, MultipleOf, ) +from vllm.attention.layer import Attention from vllm.compilation.counter import compilation_counter from vllm.compilation.cuda_graph import CUDAGraphWrapper from vllm.compilation.monitor import 
set_cudagraph_capturing_enabled diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index ff047d8d03f0e..b799f1be73d9c 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -13,7 +13,7 @@ from typing import ( import torch -from vllm.attention import AttentionBackend +from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig from vllm.config.cache import CacheDType from vllm.distributed.kv_transfer import ( diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 72d4474b89627..9c1fbfd24149d 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -17,9 +17,8 @@ import torch_xla.distributed.spmd as xs import torch_xla.runtime as xr import vllm.envs as envs -from vllm.attention import Attention from vllm.attention.backends.abstract import AttentionType -from vllm.attention.layer import MLAAttention +from vllm.attention.layer import Attention, MLAAttention from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper from vllm.config import ( From 56539cddac9eeab0a91941d8de689a6cae5dbe05 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 26 Nov 2025 14:07:13 -0500 Subject: [PATCH 011/197] [Core] Refactor padding logic and pad for CUDA graphs before attention metadata building (#28579) --- docs/design/cuda_graphs.md | 8 +- tests/v1/cudagraph/test_cudagraph_dispatch.py | 43 +- vllm/forward_context.py | 18 +- vllm/v1/attention/backends/flashinfer.py | 21 +- vllm/v1/attention/backends/mamba_attn.py | 2 + vllm/v1/attention/backends/utils.py | 5 +- vllm/v1/cudagraph_dispatcher.py | 97 ++-- vllm/v1/worker/dp_utils.py | 17 +- vllm/v1/worker/gpu_model_runner.py | 432 ++++++++++-------- vllm/v1/worker/gpu_worker.py | 41 +- 10 files changed, 401 
insertions(+), 283 deletions(-) diff --git a/docs/design/cuda_graphs.md b/docs/design/cuda_graphs.md index 66bf3b27d1f52..7baadf8ba23cb 100644 --- a/docs/design/cuda_graphs.md +++ b/docs/design/cuda_graphs.md @@ -84,12 +84,14 @@ See the following figures for a quick comparison between the previous and curren ```python class BatchDescriptor(NamedTuple): num_tokens: int - uniform_decode: bool = False + num_reqs: int + uniform: bool = False + has_lora: bool = False ``` -where `num_tokens` can be the padded token length, and `uniform_decode` is determined by if `max_query_len` of a batch is equal to the desired `max_query_len` of a uniform_decode, and the num_scheduled_tokens is divisible by that desired `max_query_len`. +where `num_tokens` can be the padded token length, and `uniform` indicates if all the requests have the same query lengths. Many attention backends only support full cudagraphs when the batches are uniform; pure decode batches are uniform but may not be query length 1 (i.e. `num_tokens == num_reqs`), this occurs in the validation pass of spec-decode where "decode" batches will have a query length of `1+num_spec_tokens`. -The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item. We are safe to exclude items like `uniform_query_len` because it is a constant at runtime for a certain setup currently. For example, it should be either `1` for a commonly pure decode or `1+num_spec_tokens` for a validation phase of speculative decode. +The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item. !!! 
note The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs). diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py index bb953e5c70c8c..314e7094ef97f 100644 --- a/tests/v1/cudagraph/test_cudagraph_dispatch.py +++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py @@ -42,12 +42,24 @@ def _create_vllm_config( mock_config.compilation_config = compilation_config mock_config.scheduler_config = SchedulerConfig(max_num_seqs=max_num_seqs) mock_config.parallel_config = ParallelConfig() + mock_config.speculative_config = None # No speculative decoding if not lora_config: mock_config.lora_config = None # Mimic the behavior of VllmConfig.__post_init__() if compilation_config.mode == CompilationMode.VLLM_COMPILE: compilation_config.set_splitting_ops_for_v1() + # mimic VllmConfig.__post_init__ + if compilation_config.cudagraph_capture_sizes: + compilation_config.max_cudagraph_capture_size = ( + compilation_config.cudagraph_capture_sizes[-1] + ) + + compilation_config.post_init_cudagraph_sizes() + mock_config.pad_for_cudagraph = ( + lambda batch_size: compilation_config.bs_to_padded_graph_size[batch_size] + ) + return mock_config @@ -109,9 +121,11 @@ class TestCudagraphDispatcher: # 1. 
non-uniform batch, size in cudagraph size list desc_full_exact = BatchDescriptor( num_tokens=8, - uniform_decode=False, + uniform=False, + ) + rt_mode, key = dispatcher.dispatch( + num_tokens=8, uniform_decode=False, has_lora=False ) - rt_mode, key = dispatcher.dispatch(desc_full_exact) if cudagraph_mode_str == "FULL": assert rt_mode == CUDAGraphMode.FULL assert key == desc_full_exact @@ -122,32 +136,37 @@ class TestCudagraphDispatcher: assert rt_mode == CUDAGraphMode.NONE # 2. uniform decode batch, size in cudagraph size list - desc_uniform_exact = BatchDescriptor(num_tokens=8, uniform_decode=True) - rt_mode, key = dispatcher.dispatch(desc_uniform_exact) + desc_uniform_exact = BatchDescriptor(num_tokens=8, num_reqs=8, uniform=True) + rt_mode, key = dispatcher.dispatch( + num_tokens=8, uniform_decode=True, has_lora=False + ) if cudagraph_mode_str == "FULL": assert rt_mode == CUDAGraphMode.FULL - assert key == desc_uniform_exact.non_uniform + assert key == desc_uniform_exact.relax_for_mixed_batch_cudagraphs() elif cudagraph_mode_str in ["FULL_DECODE_ONLY", "FULL_AND_PIECEWISE"]: assert rt_mode == CUDAGraphMode.FULL assert key == desc_uniform_exact elif cudagraph_mode_str == "PIECEWISE": assert rt_mode == CUDAGraphMode.PIECEWISE - assert key == desc_uniform_exact.non_uniform + assert key == desc_uniform_exact.relax_for_mixed_batch_cudagraphs() else: assert rt_mode == CUDAGraphMode.NONE # 3. No key match - desc_no_match = BatchDescriptor(num_tokens=15, uniform_decode=False) - rt_mode, key = dispatcher.dispatch(desc_no_match) + rt_mode, key = dispatcher.dispatch( + num_tokens=15, uniform_decode=False, has_lora=False + ) assert rt_mode == CUDAGraphMode.NONE - assert key is None + assert key == BatchDescriptor(num_tokens=15) # 4. 
Cascade attention should have a fall back mode - desc_full_exact = BatchDescriptor(num_tokens=8, uniform_decode=False) - rt_mode, key = dispatcher.dispatch(desc_full_exact, use_cascade_attn=True) + desc_full_exact = BatchDescriptor(num_tokens=8, uniform=False) + rt_mode, key = dispatcher.dispatch( + num_tokens=8, uniform_decode=False, has_lora=False, use_cascade_attn=True + ) if "PIECEWISE" in cudagraph_mode_str: # string contains check assert rt_mode == CUDAGraphMode.PIECEWISE - assert key == desc_full_exact.non_uniform + assert key == desc_full_exact.relax_for_mixed_batch_cudagraphs() else: assert rt_mode == CUDAGraphMode.NONE diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 7cb490e391abb..635419bc7cad4 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -35,23 +35,27 @@ class BatchDescriptor(NamedTuple): """ num_tokens: int - uniform_decode: bool = False + num_reqs: int | None = None """ - False can also be used for an uniform decode batch to dispatch to the - cudagraph supporting non-uniform batches. + Number of requests in the batch. Can be None for PIECEWISE cudagraphs where + the cudagraphs can handle any number of requests. + """ + uniform: bool = False + """ + True if all the requests in the batch have the same number of tokens. """ has_lora: bool = False """ Whether this batch has active LoRA adapters. """ - @property - def non_uniform(self) -> "BatchDescriptor": + def relax_for_mixed_batch_cudagraphs(self) -> "BatchDescriptor": """ - Return a non-uniform version of current batch descriptor. + Return a relaxed version of current batch descriptor that is still compatible + with PIECEWISE cudagraphs (or mixed prefill-decode FA cudagraphs). 
""" return BatchDescriptor( - self.num_tokens, uniform_decode=False, has_lora=self.has_lora + self.num_tokens, num_reqs=None, uniform=False, has_lora=self.has_lora ) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 8159f4096107f..dbd72b298b1fd 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -930,31 +930,12 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): if num_decodes > 0: pure_decode = num_prefills == 0 - # possible required padding for cudagraph replay use_cudagraph = ( self.enable_cuda_graph and pure_decode and num_decode_tokens <= self._decode_cudagraph_max_bs ) - if use_cudagraph: - num_input_tokens = self.vllm_config.pad_for_cudagraph( - num_decode_tokens - ) - # Carefully fulfill the padding region with reasonable value - # on cpu. - # Make sure paged_kv_indptr_cpu is not decreasing - self.paged_kv_indptr_cpu[ - 1 + num_decodes : 1 + num_input_tokens - ].fill_(paged_kv_indptr_cpu[-1]) - # Fill the remaining paged_kv_last_page_len_cpu with 1. - # This is because flashinfer treats 0 as a full page - # instead of empty. 
- self.paged_kv_last_page_len_cpu[num_decodes:num_input_tokens].fill_( - 1 - ) - - else: - num_input_tokens = num_decode_tokens + num_input_tokens = num_decode_tokens attn_metadata.decode_wrapper = self._get_decode_wrapper( num_input_tokens, use_cudagraph diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py index 0d875565fc99a..a9705db59f19d 100644 --- a/vllm/v1/attention/backends/mamba_attn.py +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -107,6 +107,8 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): ) # -1 in case it's non-computed and causes later issues with indexing block_idx_last_computed_token = block_idx_last_computed_token.clamp(min=0) + # -1 in the case we have a padded request (0 seq-len) + block_idx_last_scheduled_token = block_idx_last_scheduled_token.clamp(min=0) return ( block_idx_last_computed_token, diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index cebfe8a3ff04e..18e91fd4fd6a5 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -72,6 +72,7 @@ class CommonAttentionMetadata: num_reqs: int """Number of requests""" + # TODO(lucas): rename to num_tokens since it may be padded and this is misleading num_actual_tokens: int """Total number of tokens in batch""" max_query_len: int @@ -857,7 +858,9 @@ def split_decodes_and_prefills( if require_uniform: is_prefill = query_lens != query_lens[0] else: - is_prefill = query_lens > decode_threshold + # 0-query len indicates a padded request; leave this at the back + # of the batch with the prefills + is_prefill = (query_lens > decode_threshold) | (query_lens == 0) if not torch.any(is_prefill): return num_reqs, 0, num_tokens, 0 diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index b480ac78f23cf..ef0f8d9e67452 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -4,6 +4,9 @@ from itertools 
import product from vllm.config import CUDAGraphMode, VllmConfig from vllm.forward_context import BatchDescriptor +from vllm.logger import init_logger + +logger = init_logger(__name__) class CudagraphDispatcher: @@ -28,7 +31,11 @@ class CudagraphDispatcher: def __init__(self, vllm_config: VllmConfig): self.vllm_config = vllm_config self.compilation_config = vllm_config.compilation_config - self.cudagraph_mode = self.compilation_config.cudagraph_mode + self.uniform_decode_query_len = ( + 1 + if not self.vllm_config.speculative_config + else 1 + self.vllm_config.speculative_config.num_speculative_tokens + ) # Dict to store valid cudagraph dispatching keys. self.cudagraph_keys: dict[CUDAGraphMode, set[BatchDescriptor]] = { @@ -36,25 +43,42 @@ class CudagraphDispatcher: CUDAGraphMode.FULL: set(), } - not_use_piecewise_compilation = ( - not self.cudagraph_mode.requires_piecewise_compilation() - ) - assert ( - not_use_piecewise_compilation + not self.compilation_config.cudagraph_mode.requires_piecewise_compilation() or self.compilation_config.is_attention_compiled_piecewise() ), ( "Compilation mode should be CompilationMode.VLLM_COMPILE when " "cudagraph_mode piecewise cudagraphs is used, " "and attention should be in splitting_ops or " "inductor splitting should be used. 
" - f"cudagraph_mode={self.cudagraph_mode}, " + f"cudagraph_mode={self.compilation_config.cudagraph_mode}, " f"compilation_mode={self.compilation_config.mode}, " f"splitting_ops={self.compilation_config.splitting_ops}" ) self.keys_initialized = False + def _create_padded_batch_descriptor( + self, num_tokens: int, uniform_decode: bool, has_lora: bool + ) -> BatchDescriptor: + max_num_seqs = self.vllm_config.scheduler_config.max_num_seqs + uniform_decode_query_len = self.uniform_decode_query_len + num_tokens_padded = self.vllm_config.pad_for_cudagraph(num_tokens) + + if uniform_decode and self.cudagraph_mode.has_mode(CUDAGraphMode.FULL): + num_reqs = num_tokens_padded // uniform_decode_query_len + assert num_tokens_padded % uniform_decode_query_len == 0 + else: + uniform_decode = False + num_reqs = min(num_tokens_padded, max_num_seqs) + + return BatchDescriptor( + num_tokens=num_tokens_padded, + num_reqs=num_reqs, + uniform=uniform_decode, + has_lora=has_lora, + ) + def add_cudagraph_key( self, runtime_mode: CUDAGraphMode, batch_descriptor: BatchDescriptor ): @@ -66,7 +90,9 @@ class CudagraphDispatcher: def initialize_cudagraph_keys( self, cudagraph_mode: CUDAGraphMode, uniform_decode_query_len: int ): - # This should be called only after attention backend is initialized. + # This should be called only after attention backend is initialized. So we can + # get the correct cudagraph mode after backend support is resolved. 
+ self.cudagraph_mode = cudagraph_mode # LoRA activation cases to specialize the cuda graphs on if self.vllm_config.lora_config: @@ -86,9 +112,9 @@ class CudagraphDispatcher: ): self.add_cudagraph_key( cudagraph_mode.mixed_mode(), - BatchDescriptor( - num_tokens=bs, uniform_decode=False, has_lora=has_lora - ), + self._create_padded_batch_descriptor( + bs, False, has_lora + ).relax_for_mixed_batch_cudagraphs(), ) # if decode cudagraph mode is FULL, and we don't already have mixed @@ -109,40 +135,49 @@ class CudagraphDispatcher: for bs, has_lora in product(cudagraph_capture_sizes_for_decode, lora_cases): self.add_cudagraph_key( CUDAGraphMode.FULL, - BatchDescriptor( - num_tokens=bs, uniform_decode=True, has_lora=has_lora - ), + self._create_padded_batch_descriptor(bs, True, has_lora), ) + self.keys_initialized = True def dispatch( - self, batch_descriptor: BatchDescriptor, use_cascade_attn: bool = False - ) -> tuple[CUDAGraphMode, BatchDescriptor | None]: + self, + num_tokens: int, + uniform_decode: bool, + has_lora: bool, + use_cascade_attn: bool = False, + ) -> tuple[CUDAGraphMode, BatchDescriptor]: """ Given conditions(e.g.,batch descriptor and if using cascade attention), dispatch to a cudagraph runtime mode and the valid batch descriptor. A new batch descriptor is returned as we might dispatch a uniform batch to a graph that supports a more general batch (uniform to non-uniform). """ - # if not initialized, just skip dispatching. 
- if not self.keys_initialized: - return CUDAGraphMode.NONE, None + if ( + not self.keys_initialized + or self.cudagraph_mode == CUDAGraphMode.NONE + or num_tokens > self.compilation_config.max_cudagraph_capture_size + ): + return CUDAGraphMode.NONE, BatchDescriptor(num_tokens) + + batch_desc = self._create_padded_batch_descriptor( + num_tokens, uniform_decode, has_lora + ) + relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs() - non_uniform_key = batch_descriptor.non_uniform - # if a batch use cascade attention, bypass checking full cudagraphs if not use_cascade_attn: # check if key exists for full cudagraph - if batch_descriptor in self.cudagraph_keys[CUDAGraphMode.FULL]: - return CUDAGraphMode.FULL, batch_descriptor + if batch_desc in self.cudagraph_keys[CUDAGraphMode.FULL]: + return CUDAGraphMode.FULL, batch_desc - # otherwise, check if non-uniform key exists - if non_uniform_key in self.cudagraph_keys[CUDAGraphMode.FULL]: - return CUDAGraphMode.FULL, non_uniform_key + # otherwise, check if the relaxed key exists + if relaxed_batch_desc in self.cudagraph_keys[CUDAGraphMode.FULL]: + return CUDAGraphMode.FULL, relaxed_batch_desc - # also check if non-uniform key exists for more "general" + # also check if the relaxed key exists for more "general" # piecewise cudagraph - if non_uniform_key in self.cudagraph_keys[CUDAGraphMode.PIECEWISE]: - return CUDAGraphMode.PIECEWISE, non_uniform_key + if relaxed_batch_desc in self.cudagraph_keys[CUDAGraphMode.PIECEWISE]: + return CUDAGraphMode.PIECEWISE, relaxed_batch_desc - # finally, just return no cudagraphs - return CUDAGraphMode.NONE, None + # finally, just return no cudagraphs and a trivial batch descriptor + return CUDAGraphMode.NONE, BatchDescriptor(num_tokens) diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py index 464fbf11a21ad..064f2f0360cbf 100644 --- a/vllm/v1/worker/dp_utils.py +++ b/vllm/v1/worker/dp_utils.py @@ -9,6 +9,7 @@ from vllm.config import ParallelConfig from 
vllm.distributed.parallel_state import get_dp_group from vllm.logger import init_logger from vllm.v1.worker.ubatch_utils import ( + UBatchSlice, UBatchSlices, check_ubatch_thresholds, create_ubatch_slices, @@ -88,6 +89,17 @@ def _post_process_dp_padding(tensor: torch.Tensor, should_dp_pad: bool) -> torch return num_tokens_across_dp.cpu() +# This just pads the second ubatch slice out to the total number of tokens +# (num_tokens + padding) since we do `create_ubatch_slices` before applying DP padding. +def _pad_out_ubatch_slice(ubatch_slices: UBatchSlices, num_total_tokens: int): + padded_second_ubatch_slice = slice( + ubatch_slices[1].token_slice.start, num_total_tokens + ) + ubatch_slices[1] = UBatchSlice( + padded_second_ubatch_slice, padded_second_ubatch_slice + ) + + def _synchronize_dp_ranks( num_tokens_unpadded: int, num_tokens_padded: int, @@ -220,11 +232,14 @@ def coordinate_batch_across_dp( # to the second ubatch in pad_out_ubatch_slice after attention # metadata creation assert num_tokens_after_padding is not None - token_split_point = int(num_tokens_after_padding[0].item()) // 2 + num_tokens_padded = int(num_tokens_after_padding[0].item()) + token_split_point = int(num_tokens_padded) // 2 assert num_scheduled_tokens_per_request is not None ubatch_slices = create_ubatch_slices( num_scheduled_tokens_per_request, token_split_point ) + ubatch_slices = _pad_out_ubatch_slice(ubatch_slices, num_tokens_padded) + assert sum(s.num_tokens for s in ubatch_slices) == num_tokens_padded return (ubatch_slices, num_tokens_after_padding) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 581921a9bfe52..0ae4eb48acf22 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -151,7 +151,6 @@ from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin from vllm.v1.worker.lora_model_runner_mixin import 
LoRAModelRunnerMixin from vllm.v1.worker.ubatch_utils import ( - UBatchSlice, UBatchSlices, check_ubatch_thresholds, ) @@ -1239,17 +1238,13 @@ class GPUModelRunner( self, scheduler_output: "SchedulerOutput", num_scheduled_tokens: np.ndarray, - max_num_scheduled_tokens: int, ) -> tuple[ torch.Tensor, SpecDecodeMetadata | None, - UBatchSlices | None, - torch.Tensor | None, ]: """ :return: tuple[ logits_indices, spec_decode_metadata, - ubatch_slices, num_tokens_across_dp, ] """ total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens @@ -1364,28 +1359,6 @@ class GPUModelRunner( self.query_start_loc.copy_to_gpu() query_start_loc = self.query_start_loc.gpu[: num_reqs + 1] - num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens - num_tokens_padded = self._get_num_input_tokens(num_tokens_unpadded) - uniform_decode = ( - max_num_scheduled_tokens == self.uniform_decode_query_len - ) and (total_num_scheduled_tokens == num_reqs * max_num_scheduled_tokens) - - # Disable DP padding when running eager to avoid excessive padding when - # running prefills. This lets us set enforce_eager on the prefiller in - # a P/D setup and still use CUDA graphs (enabled by this padding) on the - # decoder. 
- allow_dp_padding = self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE - - ubatch_slices, num_tokens_across_dp = coordinate_batch_across_dp( - num_tokens_unpadded=num_tokens_unpadded, - parallel_config=self.parallel_config, - allow_microbatching=True, - allow_dp_padding=allow_dp_padding, - num_tokens_padded=num_tokens_padded, - uniform_decode=uniform_decode, - num_scheduled_tokens_per_request=num_scheduled_tokens, - ) - self.seq_lens.np[:num_reqs] = ( self.input_batch.num_computed_tokens_cpu[:num_reqs] + num_scheduled_tokens ) @@ -1486,15 +1459,15 @@ class GPUModelRunner( return ( logits_indices, spec_decode_metadata, - ubatch_slices, - num_tokens_across_dp, ) def _build_attention_metadata( self, - total_num_scheduled_tokens: int, - max_num_scheduled_tokens: int, + num_tokens: int, num_reqs: int, + max_query_len: int, + num_tokens_padded: int | None = None, + num_reqs_padded: int | None = None, ubatch_slices: UBatchSlices | None = None, logits_indices: torch.Tensor | None = None, use_spec_decode: bool = False, @@ -1505,6 +1478,9 @@ class GPUModelRunner( """ :return: tuple[attn_metadata, spec_decode_common_attn_metadata] """ + num_tokens_padded = num_tokens_padded or num_tokens + num_reqs_padded = num_reqs_padded or num_reqs + logits_indices_padded = None num_logits_indices = None if logits_indices is not None: @@ -1522,28 +1498,13 @@ class GPUModelRunner( self.dcp_rank, self.parallel_config.cp_kv_cache_interleave_size, ) - self.dcp_local_seq_lens.copy_to_gpu(num_reqs) + self.dcp_local_seq_lens.cpu[num_reqs:].fill_(0) + self.dcp_local_seq_lens.copy_to_gpu(num_reqs_padded) attn_metadata: PerLayerAttnMetadata = {} if ubatch_slices is not None: attn_metadata = [dict() for _ in range(len(ubatch_slices))] - # Used in the below loop - query_start_loc = self.query_start_loc.gpu[: num_reqs + 1] - query_start_loc_cpu = self.query_start_loc.cpu[: num_reqs + 1] - seq_lens = self.seq_lens.gpu[:num_reqs] - seq_lens_cpu = self.seq_lens.cpu[:num_reqs] - 
num_computed_tokens_cpu = self.input_batch.num_computed_tokens_cpu_tensor[ - :num_reqs - ] - - dcp_local_seq_lens, dcp_local_seq_lens_cpu = None, None - if self.dcp_world_size > 1: - dcp_local_seq_lens = self.dcp_local_seq_lens.gpu[:num_reqs] - dcp_local_seq_lens_cpu = self.dcp_local_seq_lens.cpu[:num_reqs] - - spec_decode_common_attn_metadata = None - if for_cudagraph_capture: # For some attention backends (e.g. FA) with sliding window models we need # to make sure the backend see a max_seq_len that is larger to the sliding @@ -1559,6 +1520,22 @@ class GPUModelRunner( self.num_accepted_tokens.np[num_reqs:].fill(1) self.num_accepted_tokens.copy_to_gpu() + # Used in the below loop, uses padded shapes + query_start_loc = self.query_start_loc.gpu[: num_reqs_padded + 1] + query_start_loc_cpu = self.query_start_loc.cpu[: num_reqs_padded + 1] + seq_lens = self.seq_lens.gpu[:num_reqs_padded] + seq_lens_cpu = self.seq_lens.cpu[:num_reqs_padded] + num_computed_tokens_cpu = self.input_batch.num_computed_tokens_cpu_tensor[ + :num_reqs_padded + ] + + dcp_local_seq_lens, dcp_local_seq_lens_cpu = None, None + if self.dcp_world_size > 1: + dcp_local_seq_lens = self.dcp_local_seq_lens.gpu[:num_reqs_padded] + dcp_local_seq_lens_cpu = self.dcp_local_seq_lens.cpu[:num_reqs_padded] + + spec_decode_common_attn_metadata = None + # Prepare the attention metadata for each KV cache group and make layers # in the same group share the same metadata. for kv_cache_gid, kv_cache_group in enumerate( @@ -1567,30 +1544,31 @@ class GPUModelRunner( encoder_seq_lens, encoder_seq_lens_cpu = self._get_encoder_seq_lens( num_scheduled_tokens or {}, kv_cache_group.kv_cache_spec, - num_reqs, + num_reqs_padded, ) if isinstance(kv_cache_group.kv_cache_spec, EncoderOnlyAttentionSpec): # Encoder-only layers do not have KV cache, so we need to # create a dummy block table and slot mapping for them. 
blk_table_tensor = torch.zeros( - (num_reqs, 1), + (num_tokens_padded, 1), dtype=torch.int32, device=self.device, ) slot_mapping = torch.zeros( - (total_num_scheduled_tokens,), + (num_tokens_padded,), dtype=torch.int64, device=self.device, ) else: blk_table = self.input_batch.block_table[kv_cache_gid] - blk_table_tensor = blk_table.get_device_tensor(num_reqs) - slot_mapping = blk_table.slot_mapping.gpu[:total_num_scheduled_tokens] + blk_table_tensor = blk_table.get_device_tensor(num_reqs_padded) + slot_mapping = blk_table.slot_mapping.gpu[:num_tokens_padded] # Fill unused with -1. Needed for reshape_and_cache in full cuda - # graph mode. - blk_table.slot_mapping.gpu[total_num_scheduled_tokens:].fill_(-1) + # graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID + slot_mapping[num_tokens:num_tokens_padded].fill_(-1) + blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1) common_attn_metadata = CommonAttentionMetadata( query_start_loc=query_start_loc, @@ -1598,9 +1576,9 @@ class GPUModelRunner( seq_lens=seq_lens, seq_lens_cpu=seq_lens_cpu, num_computed_tokens_cpu=num_computed_tokens_cpu, - num_reqs=num_reqs, - num_actual_tokens=total_num_scheduled_tokens, - max_query_len=max_num_scheduled_tokens, + num_actual_tokens=num_tokens_padded, + num_reqs=num_reqs_padded, + max_query_len=max_query_len, max_seq_len=max_seq_len, block_table_tensor=blk_table_tensor, slot_mapping=slot_mapping, @@ -1631,9 +1609,11 @@ class GPUModelRunner( extra_attn_metadata_args = {} if use_spec_decode and isinstance(builder, GDNAttentionMetadataBuilder): extra_attn_metadata_args = dict( - num_accepted_tokens=self.num_accepted_tokens.gpu[:num_reqs], + num_accepted_tokens=self.num_accepted_tokens.gpu[ + :num_reqs_padded + ], num_decode_draft_tokens_cpu=self.num_decode_draft_tokens.cpu[ - :num_reqs + :num_reqs_padded ], ) @@ -1677,6 +1657,7 @@ class GPUModelRunner( def _compute_cascade_attn_prefix_lens( self, num_scheduled_tokens: np.ndarray, + num_computed_tokens: np.ndarray, 
num_common_prefix_blocks: list[int], ) -> list[list[int]] | None: """ @@ -1699,6 +1680,7 @@ class GPUModelRunner( # 0 if cascade attention should not be used cascade_attn_prefix_len = self._compute_cascade_attn_prefix_len( num_scheduled_tokens, + num_computed_tokens, num_common_prefix_blocks[kv_cache_gid], attn_group.kv_cache_spec, attn_group.get_metadata_builder(), @@ -1711,6 +1693,7 @@ class GPUModelRunner( def _compute_cascade_attn_prefix_len( self, num_scheduled_tokens: np.ndarray, + num_computed_tokens: np.ndarray, num_common_prefix_blocks: int, kv_cache_spec: KVCacheSpec, attn_metadata_builder: AttentionMetadataBuilder, @@ -1777,10 +1760,7 @@ class GPUModelRunner( # and the second kernel will get an empty input. While this is not # a fundamental problem, our current implementation does not support # this case. - num_reqs = len(num_scheduled_tokens) - common_prefix_len = min( - common_prefix_len, self.input_batch.num_computed_tokens_cpu[:num_reqs].min() - ) + common_prefix_len = min(common_prefix_len, num_computed_tokens.min()) # common_prefix_len should be a multiple of the block size. common_prefix_len = ( common_prefix_len // kv_cache_spec.block_size * kv_cache_spec.block_size @@ -2334,19 +2314,6 @@ class GPUModelRunner( log_stats=self.parallel_config.eplb_config.log_balancedness, ) - # This is where the second ubatch is adjusted to account for the padding. - # Should be called after attention metadata creation. 
This just pads - # the second ubatch slice out to the total number of tokens - # (num_tokens + padding) - @staticmethod - def pad_out_ubatch_slice(ubatch_slices: UBatchSlices, num_total_tokens: int): - padded_second_ubatch_slice = slice( - ubatch_slices[1].token_slice.start, num_total_tokens - ) - ubatch_slices[1] = UBatchSlice( - padded_second_ubatch_slice, padded_second_ubatch_slice - ) - def _pool( self, hidden_states: torch.Tensor, @@ -2391,18 +2358,7 @@ class GPUModelRunner( pooler_output=pooler_output, ) - def _get_num_input_tokens(self, num_scheduled_tokens: int) -> int: - if ( - self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE - and hasattr(self, "cudagraph_batch_sizes") - and self.cudagraph_batch_sizes - and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1] - ): - # Use CUDA graphs. - # Add padding to the batch size. - return self.vllm_config.pad_for_cudagraph(num_scheduled_tokens) - - # Eager mode. + def _pad_for_sequence_parallelism(self, num_scheduled_tokens: int) -> int: # Pad tokens to multiple of tensor_parallel_size when # enabled collective fusion for SP tp_size = self.vllm_config.parallel_config.tensor_parallel_size @@ -2738,6 +2694,87 @@ class GPUModelRunner( **model_kwargs, ) + def _determine_batch_execution_and_padding( + self, + num_tokens: int, + num_reqs: int, + num_scheduled_tokens_np: np.ndarray, + max_num_scheduled_tokens: int, + use_cascade_attn: bool, + allow_microbatching: bool = True, + force_eager: bool = False, + # For cudagraph capture TODO(lucas): Refactor how we capture cudagraphs (will + # be improved in model runner v2) + force_uniform_decode: bool | None = None, + force_has_lora: bool | None = None, + ) -> tuple[ + CUDAGraphMode, BatchDescriptor, UBatchSlices | None, torch.Tensor | None + ]: + num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens) + uniform_decode = ( + ( + (max_num_scheduled_tokens == self.uniform_decode_query_len) + and (num_tokens_padded == max_num_scheduled_tokens * num_reqs) + 
) + if force_uniform_decode is None + else force_uniform_decode + ) + + has_lora = ( + len(self.input_batch.lora_id_to_lora_request) > 0 + if force_has_lora is None + else force_has_lora + ) + + dispatch_cudagraph = ( + lambda num_tokens: self.cudagraph_dispatcher.dispatch( + num_tokens=num_tokens, + has_lora=has_lora, + use_cascade_attn=use_cascade_attn, + uniform_decode=uniform_decode, + ) + if not force_eager + else (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded)) + ) + + cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded) + num_tokens_padded = batch_descriptor.num_tokens + + # Extra coordination when running data-parallel since we need to coordinate + # across ranks + ubatch_slices, num_tokens_across_dp = None, None + if self.vllm_config.parallel_config.data_parallel_size > 1: + # Disable DP padding when running eager to avoid excessive padding when + # running prefills. This lets us set cudagraph_mode="NONE" on the prefiller + # in a P/D setup and still use CUDA graphs (enabled by this padding) on the + # decoder. 
+ allow_dp_padding = ( + self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + ) + + ubatch_slices, num_tokens_across_dp = coordinate_batch_across_dp( + num_tokens_unpadded=num_tokens_padded, + parallel_config=self.parallel_config, + allow_microbatching=allow_microbatching, + allow_dp_padding=allow_dp_padding, + num_tokens_padded=num_tokens_padded, + uniform_decode=uniform_decode, + num_scheduled_tokens_per_request=num_scheduled_tokens_np, + ) + + # Extract DP padding if there is any + if num_tokens_across_dp is not None: + dp_rank = self.parallel_config.data_parallel_rank + num_tokens_padded = int(num_tokens_across_dp[dp_rank].item()) + + # Re-dispatch with DP padding + cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded) + # Assert to make sure the agreed upon token count is correct otherwise + # num_tokens_across_dp will no-longer be valid + assert batch_descriptor.num_tokens == num_tokens_padded + + return cudagraph_mode, batch_descriptor, ubatch_slices, num_tokens_across_dp + @torch.inference_mode() def execute_model( self, @@ -2790,7 +2827,7 @@ class GPUModelRunner( # returns True. before returning early here we call # dummy run to ensure coordinate_batch_across_dp # is called into to avoid out of sync issues. - self._dummy_run(self._get_num_input_tokens(1)) + self._dummy_run(1) if not has_kv_transfer_group(): # Return empty ModelRunnerOutput if no work to do. 
return EMPTY_MODEL_RUNNER_OUTPUT @@ -2809,36 +2846,63 @@ class GPUModelRunner( tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids] num_scheduled_tokens_np = np.array(tokens, dtype=np.int32) max_num_scheduled_tokens = int(num_scheduled_tokens_np.max()) + num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens ( logits_indices, spec_decode_metadata, - ubatch_slices, - num_tokens_across_dp, ) = self._prepare_inputs( - scheduler_output, num_scheduled_tokens_np, max_num_scheduled_tokens + scheduler_output, + num_scheduled_tokens_np, ) cascade_attn_prefix_lens = None # Disable cascade attention when using microbatching (DBO) - if self.cascade_attn_enabled and ubatch_slices is None: + if self.cascade_attn_enabled and not self.parallel_config.enable_dbo: # Pre-compute cascade attention prefix lengths - # NOTE: Must be AFTER _prepare_inputs uses self.input_batch state cascade_attn_prefix_lens = self._compute_cascade_attn_prefix_lens( num_scheduled_tokens_np, + self.input_batch.num_computed_tokens_cpu[:num_reqs], scheduler_output.num_common_prefix_blocks, ) - # TODO(lucas): move cudagraph dispatching here: - # https://github.com/vllm-project/vllm/issues/23789 + ( + cudagraph_mode, + batch_desc, + ubatch_slices, + num_tokens_across_dp, + ) = self._determine_batch_execution_and_padding( + num_tokens=num_tokens_unpadded, + num_reqs=num_reqs, + num_scheduled_tokens_np=num_scheduled_tokens_np, + max_num_scheduled_tokens=max_num_scheduled_tokens, + use_cascade_attn=cascade_attn_prefix_lens is not None, + ) + + logger.debug( + "Running batch with cudagraph_mode: %s, batch_descriptor: %s, " + "ubatch_slices: %s, num_tokens_across_dp: %s", + cudagraph_mode, + batch_desc, + ubatch_slices, + num_tokens_across_dp, + ) + + num_tokens_padded = batch_desc.num_tokens + num_reqs_padded = ( + batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs + ) - total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens use_spec_decode = 
len(scheduler_output.scheduled_spec_decode_tokens) > 0 - attn_metadata, spec_decode_common_attn_metadata = ( + pad_attn = cudagraph_mode == CUDAGraphMode.FULL + + (attn_metadata, spec_decode_common_attn_metadata) = ( self._build_attention_metadata( - total_num_scheduled_tokens=total_num_scheduled_tokens, - max_num_scheduled_tokens=max_num_scheduled_tokens, + num_tokens=num_tokens_unpadded, + num_tokens_padded=num_tokens_padded if pad_attn else None, num_reqs=num_reqs, + num_reqs_padded=num_reqs_padded if pad_attn else None, + max_query_len=max_num_scheduled_tokens, ubatch_slices=ubatch_slices, logits_indices=logits_indices, use_spec_decode=use_spec_decode, @@ -2847,49 +2911,22 @@ class GPUModelRunner( ) ) - dp_rank = self.parallel_config.data_parallel_rank - if ubatch_slices: - assert num_tokens_across_dp is not None - num_input_tokens = int(num_tokens_across_dp[dp_rank].item()) - self.pad_out_ubatch_slice(ubatch_slices, num_input_tokens) - elif num_tokens_across_dp is not None: - num_input_tokens = int(num_tokens_across_dp[dp_rank].item()) - else: - num_input_tokens = self._get_num_input_tokens( - scheduler_output.total_num_scheduled_tokens - ) - - ( - input_ids, - inputs_embeds, - positions, - intermediate_tensors, - model_kwargs, - ec_connector_output, - ) = self._preprocess( - scheduler_output, num_input_tokens, intermediate_tensors - ) - - uniform_decode = ( - max_num_scheduled_tokens == self.uniform_decode_query_len - ) and (num_scheduled_tokens == num_reqs * max_num_scheduled_tokens) - batch_desc = BatchDescriptor( - num_tokens=num_input_tokens, - uniform_decode=uniform_decode, - has_lora=len(self.input_batch.lora_id_to_lora_request) > 0, - ) - cudagraph_runtime_mode, batch_descriptor = ( - self.cudagraph_dispatcher.dispatch( - batch_desc, - use_cascade_attn=cascade_attn_prefix_lens is not None, - ) + ( + input_ids, + inputs_embeds, + positions, + intermediate_tensors, + model_kwargs, + ec_connector_output, + ) = self._preprocess( + scheduler_output, 
num_tokens_padded, intermediate_tensors ) # Set cudagraph mode to none if calc_kv_scales is true. # KV scales calculation involves dynamic operations that are incompatible # with CUDA graph capture. if self.calculate_kv_scales: - cudagraph_runtime_mode = CUDAGraphMode.NONE + cudagraph_mode = CUDAGraphMode.NONE # Mark KV scales as calculated after the first forward pass self.calculate_kv_scales = False @@ -2899,10 +2936,10 @@ class GPUModelRunner( set_forward_context( attn_metadata, self.vllm_config, - num_tokens=num_input_tokens, + num_tokens=num_tokens_padded, num_tokens_across_dp=num_tokens_across_dp, - cudagraph_runtime_mode=cudagraph_runtime_mode, - batch_descriptor=batch_descriptor, + cudagraph_runtime_mode=cudagraph_mode, + batch_descriptor=batch_desc, ubatch_slices=ubatch_slices, ), record_function_or_nullcontext("gpu_model_runner: forward"), @@ -2952,7 +2989,7 @@ class GPUModelRunner( if not get_pp_group().is_last_rank: all_gather_tensors = { "residual": not is_residual_scattered_for_sp( - self.vllm_config, num_input_tokens + self.vllm_config, num_tokens_padded ) } get_pp_group().send_tensor_dict( @@ -3841,52 +3878,44 @@ class GPUModelRunner( assert sum(num_scheduled_tokens_list) == num_tokens assert len(num_scheduled_tokens_list) == num_reqs num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32) - total_num_scheduled_tokens = int(num_scheduled_tokens.sum()) + num_tokens_unpadded = int(num_scheduled_tokens.sum()) + num_sampled_tokens = np.ones(num_reqs, dtype=np.int32) - # Disable DP padding when running eager - allow_dp_padding = self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE - - # We currently only microbatch if the number of tokens is - # over a certain threshold. 
- ubatch_slices, num_tokens_across_dp = coordinate_batch_across_dp( - num_tokens_unpadded=total_num_scheduled_tokens, - parallel_config=self.vllm_config.parallel_config, - allow_microbatching=allow_microbatching, - allow_dp_padding=allow_dp_padding, - num_tokens_padded=total_num_scheduled_tokens, - uniform_decode=uniform_decode, - num_scheduled_tokens_per_request=num_scheduled_tokens, - ) - num_tokens_after_padding = num_tokens - if num_tokens_across_dp is not None: - dp_rank = self.parallel_config.data_parallel_rank - num_tokens_after_padding = int(num_tokens_across_dp[dp_rank]) - - # filter out the valid batch descriptor - _cg_mode, batch_descriptor = ( - self.cudagraph_dispatcher.dispatch( - BatchDescriptor( - num_tokens=num_tokens_after_padding, - uniform_decode=uniform_decode, - has_lora=activate_lora and self.lora_config is not None, - ) + _cudagraph_mode, batch_desc, ubatch_slices, num_tokens_across_dp = ( + self._determine_batch_execution_and_padding( + num_tokens=num_tokens_unpadded, + num_reqs=num_reqs, + num_scheduled_tokens_np=num_scheduled_tokens, + max_num_scheduled_tokens=max_query_len, + use_cascade_attn=False, + allow_microbatching=allow_microbatching, + force_eager=is_profile + or (cudagraph_runtime_mode == CUDAGraphMode.NONE), + # `force_uniform_decode` is used for cudagraph capture; because for + # capturing mixed prefill-decode batches, we sometimes use + # num_tokens == num_reqs which looks like a uniform decode batch to the + # dispatcher; but we actually want to capture a piecewise cudagraph + force_uniform_decode=uniform_decode, + # `force_has_lora` is used for cudagraph capture; because LoRA is + # activated later in the context manager, but we need to know the + # LoRA state when determining the batch descriptor for capture + force_has_lora=activate_lora, ) - if not is_profile - else (CUDAGraphMode.NONE, None) ) - if cudagraph_runtime_mode is not None: - # we allow forcing NONE when the dispatcher disagrees to support - # warm ups for 
cudagraph capture - assert ( - cudagraph_runtime_mode == CUDAGraphMode.NONE - or cudagraph_runtime_mode == _cg_mode - ), ( - f"Cudagraph runtime mode mismatch at dummy_run. " - f"Expected {_cg_mode}, but got {cudagraph_runtime_mode}." - ) + + if cudagraph_runtime_mode is None: + cudagraph_runtime_mode = _cudagraph_mode else: - cudagraph_runtime_mode = _cg_mode + assert cudagraph_runtime_mode == _cudagraph_mode, ( + f"Cudagraph runtime mode mismatch in dummy_run. " + f"Expected {_cudagraph_mode}, but got {cudagraph_runtime_mode}." + ) + + num_tokens_padded = batch_desc.num_tokens + num_reqs_padded = ( + batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs + ) attn_metadata: PerLayerAttnMetadata | None = None @@ -3909,9 +3938,9 @@ class GPUModelRunner( self.query_start_loc.copy_to_gpu() attn_metadata, _ = self._build_attention_metadata( - total_num_scheduled_tokens=num_tokens, - max_num_scheduled_tokens=max_query_len, - num_reqs=num_reqs, + num_tokens=num_tokens_unpadded, + num_reqs=num_reqs_padded, + max_query_len=max_query_len, ubatch_slices=ubatch_slices, for_cudagraph_capture=True, ) @@ -3924,29 +3953,29 @@ class GPUModelRunner( remove_lora, ): # Make sure padding doesn't exceed max_num_tokens - assert num_tokens_after_padding <= self.max_num_tokens - model_kwargs = self._init_model_kwargs(num_tokens_after_padding) + assert num_tokens_padded <= self.max_num_tokens + model_kwargs = self._init_model_kwargs(num_tokens_padded) if self.supports_mm_inputs and not self.model_config.is_encoder_decoder: input_ids = None - inputs_embeds = self.inputs_embeds.gpu[:num_tokens_after_padding] + inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded] model_kwargs = { **model_kwargs, **self._dummy_mm_kwargs(num_reqs), } elif self.enable_prompt_embeds: input_ids = None - inputs_embeds = self.inputs_embeds.gpu[:num_tokens_after_padding] - model_kwargs = self._init_model_kwargs(num_tokens_after_padding) + inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded] 
+ model_kwargs = self._init_model_kwargs(num_tokens_padded) else: - input_ids = self.input_ids.gpu[:num_tokens_after_padding] + input_ids = self.input_ids.gpu[:num_tokens_padded] inputs_embeds = None if self.uses_mrope: - positions = self.mrope_positions.gpu[:, :num_tokens_after_padding] + positions = self.mrope_positions.gpu[:, :num_tokens_padded] elif self.uses_xdrope_dim > 0: - positions = self.xdrope_positions.gpu[:, :num_tokens_after_padding] + positions = self.xdrope_positions.gpu[:, :num_tokens_padded] else: - positions = self.positions.gpu[:num_tokens_after_padding] + positions = self.positions.gpu[:num_tokens_padded] if get_pp_group().is_first_rank: intermediate_tensors = None @@ -3961,26 +3990,26 @@ class GPUModelRunner( ) intermediate_tensors = self.sync_and_slice_intermediate_tensors( - num_tokens_after_padding, None, False + num_tokens_padded, None, False ) if ubatch_slices is not None: # Adjust values to reflect a single ubatch. # TODO(sage,lucas): this is cruft that should be addressed in # the padding refactor. - num_tokens_after_padding = ubatch_slices[0].num_tokens + num_tokens_padded = ubatch_slices[0].num_tokens if num_tokens_across_dp is not None: - num_tokens_across_dp[:] = num_tokens_after_padding + num_tokens_across_dp[:] = num_tokens_padded with ( self.maybe_randomize_inputs(input_ids), set_forward_context( attn_metadata, self.vllm_config, - num_tokens=num_tokens_after_padding, + num_tokens=num_tokens_padded, num_tokens_across_dp=num_tokens_across_dp, cudagraph_runtime_mode=cudagraph_runtime_mode, - batch_descriptor=batch_descriptor, + batch_descriptor=batch_desc, ubatch_slices=ubatch_slices, ), ): @@ -4706,8 +4735,7 @@ class GPUModelRunner( # Trigger cudagraph dispatching keys initialization after # resolved cudagraph mode. 
- cudagraph_mode = self.compilation_config.cudagraph_mode - assert cudagraph_mode is not None + self.compilation_config.cudagraph_mode = cudagraph_mode self.cudagraph_dispatcher.initialize_cudagraph_keys( cudagraph_mode, self.uniform_decode_query_len ) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 6a4bfde5f972b..d0c6091ce2a6e 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -8,12 +8,13 @@ from contextlib import AbstractContextManager, nullcontext from types import NoneType from typing import TYPE_CHECKING, Any, cast +import numpy as np import torch import torch.distributed import torch.nn as nn import vllm.envs as envs -from vllm.config import VllmConfig +from vllm.config import CUDAGraphMode, VllmConfig from vllm.distributed import ( ensure_model_parallel_initialized, init_distributed_environment, @@ -487,6 +488,7 @@ class Worker(WorkerBase): hidden_states, last_hidden_states = self.model_runner._dummy_run( num_tokens=max_num_reqs, skip_eplb=True, + cudagraph_runtime_mode=CUDAGraphMode.NONE, ) if self.model_runner.is_pooling_model: self.model_runner._dummy_pooler_run(hidden_states) @@ -534,12 +536,39 @@ class Worker(WorkerBase): intermediate_tensors = None forward_pass = scheduler_output.total_num_scheduled_tokens > 0 num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens - num_input_tokens = self.model_runner._get_num_input_tokens(num_scheduled_tokens) - all_gather_tensors = { - "residual": not is_residual_scattered_for_sp( - self.vllm_config, num_input_tokens + all_gather_tensors = {} + compilation_config = self.vllm_config.compilation_config + parallel_config = self.vllm_config.parallel_config + + if ( + parallel_config.pipeline_parallel_size > 1 + and compilation_config.pass_config.enable_sequence_parallelism + and forward_pass + ): + # currently only supported by V1 GPUModelRunner + assert isinstance(self.model_runner, GPUModelRunner) + num_scheduled_tokens_np = np.array( + 
list(scheduler_output.num_scheduled_tokens.values()), + dtype=np.int32, ) - } + # TODO(lucas): This is pretty gross; ideally we should only ever call + # `_determine_batch_execution_and_padding` once (will get called again + # in `execute_model`) but this requires a larger refactor of PP. + _, batch_desc, _, _ = ( + self.model_runner._determine_batch_execution_and_padding( + num_tokens=num_scheduled_tokens, + num_reqs=len(num_scheduled_tokens_np), + num_scheduled_tokens_np=num_scheduled_tokens_np, + max_num_scheduled_tokens=num_scheduled_tokens_np.max(), + use_cascade_attn=False, # TODO(lucas): Handle cascade attention + ) + ) + all_gather_tensors = { + "residual": not is_residual_scattered_for_sp( + self.vllm_config, batch_desc.num_tokens + ) + } + if forward_pass and not get_pp_group().is_first_rank: tensor_dict = get_pp_group().recv_tensor_dict( all_gather_group=get_tp_group(), From ba1fcd84a7f1dc907c17bf4ba4fab6762a9f33a1 Mon Sep 17 00:00:00 2001 From: Johnny Yang <24908445+jcyang43@users.noreply.github.com> Date: Wed, 26 Nov 2025 14:46:36 -0800 Subject: [PATCH 012/197] [TPU] add tpu_inference (#27277) Signed-off-by: Johnny Yang --- requirements/tpu.txt | 4 +--- vllm/distributed/device_communicators/tpu_communicator.py | 8 -------- vllm/platforms/tpu.py | 4 +++- vllm/v1/worker/tpu_worker.py | 2 +- 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/requirements/tpu.txt b/requirements/tpu.txt index 4241cbb2b0333..e6fff58f7b794 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -12,6 +12,4 @@ ray[data] setuptools==78.1.0 nixl==0.3.0 tpu_info==0.4.0 - -# Install torch_xla -torch_xla[tpu, pallas]==2.8.0 \ No newline at end of file +tpu-inference==0.11.1 diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py index a7724a86cc6a5..fa99078e9ff0d 100644 --- a/vllm/distributed/device_communicators/tpu_communicator.py +++ 
b/vllm/distributed/device_communicators/tpu_communicator.py @@ -97,11 +97,3 @@ class TpuCommunicator(DeviceCommunicatorBase): def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: assert dim == -1, "TPUs only support dim=-1 for all-gather." return xm.all_gather(input_, dim=dim) - - -if USE_TPU_INFERENCE: - from tpu_inference.distributed.device_communicators import ( - TpuCommunicator as TpuInferenceCommunicator, - ) - - TpuCommunicator = TpuInferenceCommunicator # type: ignore diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 944344a229578..aa5ddbe43659d 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -267,7 +267,9 @@ class TpuPlatform(Platform): try: - from tpu_inference.platforms import TpuPlatform as TpuInferencePlatform + from tpu_inference.platforms.tpu_platforms import ( + TpuPlatform as TpuInferencePlatform, + ) TpuPlatform = TpuInferencePlatform # type: ignore USE_TPU_INFERENCE = True diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index e1a109eca0a88..ce18ca6c37165 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -346,6 +346,6 @@ class TPUWorker: if USE_TPU_INFERENCE: - from tpu_inference.worker import TPUWorker as TpuInferenceWorker + from tpu_inference.worker.tpu_worker import TPUWorker as TpuInferenceWorker TPUWorker = TpuInferenceWorker # type: ignore From df01eda4dc570dbf9aa45dd196e288d13f427fab Mon Sep 17 00:00:00 2001 From: HDCharles <39544797+HDCharles@users.noreply.github.com> Date: Wed, 26 Nov 2025 21:35:13 -0500 Subject: [PATCH 013/197] [Bugfix] Make compressed-tensors MoEs respect ignored layers (#28878) Signed-off-by: HDCharles --- .buildkite/test-pipeline.yaml | 1 + tests/quantization/test_compressed_tensors.py | 48 +++++++++++++ .../layers/fused_moe/__init__.py | 4 ++ .../compressed_tensors/compressed_tensors.py | 72 ++++++++++++++----- .../compressed_tensors_moe.py | 60 +++++++--------- 5 files changed, 133 insertions(+), 52 
deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d14b524b793a5..375645fde7477 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -632,6 +632,7 @@ steps: # we can only upgrade after this is resolved # TODO(jerryzh168): resolve the above comment - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - label: LM Eval Small Models # 53min diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 31b65189b5ec3..412b21328a325 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -10,6 +10,7 @@ import torch from compressed_tensors.quantization import QuantizationType from tests.models.utils import check_logprobs_close +from vllm.model_executor.layers.fused_moe import UnquantizedFusedMoEMethod from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensors24, CompressedTensorsLinearMethod, @@ -767,3 +768,50 @@ def test_compressed_tensors_fp8_block_enabled(vllm_runner): output = llm.generate_greedy("Hello my name is", max_tokens=4) assert output + + +@pytest.mark.skipif( + not current_platform.is_cuda(), + reason="This test is not for non-CUDA platforms", +) +def test_compressed_tensors_moe_ignore_with_model(vllm_runner): + """ + Integration test for MoE layer ignore functionality with a real model. + + This test would verify that when loading a compressed-tensors quantized + MoE model where some MoE layers are in the ignore list, those layers + use UnquantizedFusedMoEMethod while non-ignored layers use the + quantized method. 
+ + Expected model structure: + - Compressed-tensors quantized MoE model (e.g., Mixtral-based) + - Config with ignore list containing specific MoE layers + - Multiple MoE layers where some are quantized and some are not + """ + + # model_path = "nm-testing/tinysmokeqwen3moe-W4A16-first-only" # CT 12.3 + model_path = "nm-testing/tinysmokeqwen3moe-W4A16-first-only-CTstable" # CT 12.2 + + with vllm_runner(model_path, enforce_eager=True) as llm: + + def check_model(model): + from vllm.model_executor.layers.fused_moe import FusedMoE + from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import ( # noqa: E501 + CompressedTensorsMoEMethod, + ) + + # Check layer 0 MoE (should be quantized) + layer_quantized = model.model.layers[0].mlp.experts + assert isinstance(layer_quantized, FusedMoE) + assert isinstance(layer_quantized.quant_method, CompressedTensorsMoEMethod) + + # Check layer 10 MoE (should be unquantized + ignored) + layer_unquantized = model.model.layers[3].mlp.experts + assert isinstance(layer_unquantized, FusedMoE) + assert isinstance(layer_unquantized.quant_method, UnquantizedFusedMoEMethod) + + llm.apply_model(check_model) + + # Verify the model can generate output + output = llm.generate_greedy("Hello, my name is", max_tokens=4) + assert output diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 53d98d0650b43..669abcb3d6ff1 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -18,6 +18,9 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEPrepareAndFinalize, ) from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( + UnquantizedFusedMoEMethod, +) from vllm.model_executor.layers.fused_moe.utils import activation_without_mul from vllm.triton_utils import HAS_TRITON @@ 
-41,6 +44,7 @@ __all__ = [ "FusedMoE", "FusedMoEConfig", "FusedMoEMethodBase", + "UnquantizedFusedMoEMethod", "FusedMoeWeightScaleSupported", "FusedMoEPermuteExpertsUnpermute", "FusedMoEActivationFormat", diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 2800f90ce0b67..7f61746a4e45c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -158,9 +158,23 @@ class CompressedTensorsConfig(QuantizationConfig): if isinstance(layer, Attention): return CompressedTensorsKVCacheMethod(self) if isinstance(layer, FusedMoE): - return CompressedTensorsMoEMethod.get_moe_method(self, layer) + return CompressedTensorsMoEMethod.get_moe_method(self, layer, prefix) return None + def _add_fused_moe_to_target_scheme_map(self): + """ + Helper function to update target_scheme_map + since linear layers get fused into FusedMoE + targetting 'Linear' needs to also match + FusedMoE modules. + """ + if ( + "Linear" not in self.target_scheme_map + or "FusedMoE" in self.target_scheme_map + ): + return + self.target_scheme_map["FusedMoE"] = self.target_scheme_map["Linear"] + @classmethod def from_config(cls, config: dict[str, Any]) -> "CompressedTensorsConfig": ignore: list[str] = cast(list[str], config.get("ignore", [])) @@ -655,25 +669,13 @@ class CompressedTensorsConfig(QuantizationConfig): to select the CompressedTensorsScheme used for inference. """ - # Find the "target" in the compressed-tensors config - # that our layer conforms to. 
- # TODO (@kylesayrs): support ignore module names with ct matching utils - if should_ignore_layer( - layer_name, ignore=self.ignore, fused_mapping=self.packed_modules_mapping - ): - return None + # Use the new get_quant_args method to extract QuantizationArgs + scheme_dict = self.get_scheme_dict(layer, layer_name) - # Will be empty for models with only sparsity - weight_quant = input_quant = None - if self.target_scheme_map: - matched_target = find_matched_target( - layer_name=layer_name, - module=layer, - targets=self.target_scheme_map.keys(), - fused_mapping=self.packed_modules_mapping, - ) - - scheme_dict = self.target_scheme_map[matched_target] + weight_quant = None + input_quant = None + format = None + if scheme_dict: weight_quant = scheme_dict.get("weights") input_quant = scheme_dict.get("input_activations") format = scheme_dict.get("format") @@ -732,6 +734,38 @@ class CompressedTensorsConfig(QuantizationConfig): logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name) return scheme + def get_scheme_dict( + self, layer: torch.nn.Module, layer_name: str | None = None + ) -> dict[str, QuantizationArgs | str | None] | None: + """ + Extract the QuantizationArgs for a given layer. 
+ + Returns: + dict with { + "weights": QuantizationArgs, + "input_activations": QuantizationArgs | None, + "format": str | None + } | None + """ + # TODO (@kylesayrs): support ignore module names with ct matching utils + if should_ignore_layer( + layer_name, ignore=self.ignore, fused_mapping=self.packed_modules_mapping + ): + return None + + # Will be empty for models with only sparsity + if self.target_scheme_map: + matched_target = find_matched_target( + layer_name=layer_name, + module=layer, + targets=self.target_scheme_map.keys(), + fused_mapping=self.packed_modules_mapping, + ) + + return self.target_scheme_map[matched_target] + + return None + def get_cache_scale(self, name: str) -> str | None: """ Check whether the param name matches the format for k/v cache scales diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 71d7de97d4a10..c7dfd1787cc8f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -23,6 +23,7 @@ from vllm.model_executor.layers.fused_moe import ( FusedMoEMethodBase, FusedMoEPermuteExpertsUnpermute, FusedMoeWeightScaleSupported, + UnquantizedFusedMoEMethod, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, @@ -45,9 +46,6 @@ from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compress WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP, ) -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - find_matched_target, -) from vllm.model_executor.layers.quantization.utils import replace_parameter from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( build_flashinfer_fp4_cutlass_moe_prepare_finalize, @@ -113,39 +111,35 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): def 
get_moe_method( quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 layer: torch.nn.Module, + prefix: str, ) -> "CompressedTensorsMoEMethod": + # FusedMoE was made by combining multiple Linears so need to + # make sure quantization config for Linear can target it + quant_config._add_fused_moe_to_target_scheme_map() + unfused_names = [ + prefix + proj_name + for proj_name in [".0.gate_proj", ".0.up_proj", ".0.down_proj"] + ] + # TODO: refactor this to use expert_mapping and check all layer numbers + all_scheme_dicts = [ + quant_config.get_scheme_dict(layer, name) for name in unfused_names + ] + scheme_dict = all_scheme_dicts.pop() + + # multiple schemes found + if not all([cur_dict == scheme_dict for cur_dict in all_scheme_dicts]): + raise ValueError( + "All MoE projections need to have same " + "quantization scheme but found multiple" + ) + + if scheme_dict is None: # ignored layer + return UnquantizedFusedMoEMethod(layer.moe_config) + # TODO: @dsikka: refactor this to use schemes as other kernels # are supported + check if the layer is being ignored. 
- # Check if a using "Linear" to select schemes - if "Linear" in quant_config.target_scheme_map: - matched_target = "Linear" - else: - # May have instead defined the linear layers in the fused model - - fused_layers = ["re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*"] - current_scheme = None - for fused_layer in fused_layers: - # Check if one of the fused layers are defined in quant_config - matched_target = find_matched_target( - layer_name=fused_layer, - module=layer, - targets=quant_config.target_scheme_map.keys(), - fused_mapping=quant_config.packed_modules_mapping, - ) - - # Only valid if down_proj, gate_proj, and up_proj - # are mapped to the same quant scheme in the quant_config - if current_scheme is None: - current_scheme = quant_config.target_scheme_map.get(matched_target) - else: - assert current_scheme == quant_config.target_scheme_map.get( - matched_target - ) - - weight_quant = quant_config.target_scheme_map[matched_target].get("weights") - input_quant = quant_config.target_scheme_map[matched_target].get( - "input_activations" - ) + weight_quant = scheme_dict.get("weights") + input_quant = scheme_dict.get("input_activations") if quant_config._is_wNa16_group_channel(weight_quant, input_quant): # group_size=None means channelwise From 77740191de965329e143e501321637a3e242e2f6 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 26 Nov 2025 21:48:43 -0500 Subject: [PATCH 014/197] [Attention][Async] Eliminate `seq_lens_cpu` in FlashAttention metadata building with DCP > 1 (#29449) Signed-off-by: Matthew Bonanni --- vllm/v1/attention/backends/flash_attn.py | 27 ++++++++++++------------ vllm/v1/attention/backends/utils.py | 6 ++++-- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 0fc57cfb1f9d3..a1558073003fd 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -328,7 +328,6 @@ class 
FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens - seq_lens_cpu = common_attn_metadata.seq_lens_cpu block_table_tensor = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping causal = common_attn_metadata.causal @@ -401,20 +400,23 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad prefix_scheduler_metadata = None if self.dcp_world_size > 1: - query_kv_lens_cpu = ( - common_attn_metadata.query_start_loc_cpu[1:] - - common_attn_metadata.query_start_loc_cpu[:-1] - ) - dcp_context_kv_lens_cpu = seq_lens_cpu - query_kv_lens_cpu + query_kv_lens = query_start_loc[1:] - query_start_loc[:-1] + dcp_context_kv_lens = seq_lens - query_kv_lens - dcp_context_kv_lens_cpu = get_dcp_local_seq_lens( - dcp_context_kv_lens_cpu, + dcp_context_kv_lens = get_dcp_local_seq_lens( + dcp_context_kv_lens, self.dcp_world_size, self.dcp_rank, self.cp_kv_cache_interleave_size, ) - dcp_context_kv_lens = dcp_context_kv_lens_cpu.to(self.device) - max_dcp_context_kv_len = dcp_context_kv_lens.max().item() + # After DCP distribution, the maximum number of tokens for any rank is + # ceil(L / (N * I)) * I, where L is max_seq_len, N is dcp_world_size, + # and I is cp_kv_cache_interleave_size. + # This eliminates GPU->CPU sync while minimizing workspace over-allocation. 
+ num_partitions = self.dcp_world_size * self.cp_kv_cache_interleave_size + max_dcp_context_kv_len = ( + (max_seq_len + num_partitions - 1) // num_partitions + ) * self.cp_kv_cache_interleave_size scheduler_metadata = schedule( batch_size=num_reqs, @@ -431,9 +433,8 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad prefix_kv_lens = torch.tensor( [common_prefix_len], dtype=torch.int32, device=self.device ) - suffix_kv_lens = (seq_lens_cpu[:num_reqs] - common_prefix_len).to( - self.device, non_blocking=True - ) + # Use GPU tensor directly - no CPU sync needed + suffix_kv_lens = seq_lens[:num_reqs] - common_prefix_len prefix_scheduler_metadata = schedule( batch_size=1, cu_query_lens=cu_prefix_query_lens, diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 18e91fd4fd6a5..ea9dccc702a0a 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -1095,12 +1095,14 @@ def get_dcp_local_seq_lens( num_requests = seq_lens.size(0) if dcp_rank is None: rank_offsets = ( - torch.arange(dcp_size, dtype=torch.int32) + torch.arange(dcp_size, dtype=torch.int32, device=seq_lens.device) .unsqueeze(0) .repeat(num_requests, 1) ) else: - rank_offsets = torch.Tensor([[dcp_rank]]).to(dtype=torch.int32) + rank_offsets = torch.tensor( + [[dcp_rank]], dtype=torch.int32, device=seq_lens.device + ) seq_lens_tiled = ( seq_lens.to(torch.int32).unsqueeze(-1).repeat(1, rank_offsets.shape[1]) ) From a67dec7cba6239022b5b713845e29d9cbb294ec7 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Thu, 27 Nov 2025 11:02:21 +0800 Subject: [PATCH 015/197] [Bugfix] fix IMA issue in certain cases of the moe marlin kernel (#28619) Signed-off-by: Jinzhen Lin Co-authored-by: youkaichao Co-authored-by: Michael Goin Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- csrc/moe/marlin_moe_wna16/marlin_template.h | 18 ++++++++++-------- .../layers/fused_moe/shared_fused_moe.py | 1 - 2 files 
changed, 10 insertions(+), 9 deletions(-) diff --git a/csrc/moe/marlin_moe_wna16/marlin_template.h b/csrc/moe/marlin_moe_wna16/marlin_template.h index dd86a9a5ba6e9..4dbca30da57a1 100644 --- a/csrc/moe/marlin_moe_wna16/marlin_template.h +++ b/csrc/moe/marlin_moe_wna16/marlin_template.h @@ -489,14 +489,16 @@ __global__ void Marlin( #pragma unroll for (int i = 0; i < 4; i++) { int idx = tid4 * 4 + i; - idx = idx < block_num_valid_tokens ? idx : 0; - if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) { - sh_block_topk_weights[idx] = __hmul2( - global_scale, Dtype::num2num2(Dtype::float2num( - topk_weights_ptr[sh_block_sorted_ids[idx]]))); - } else { - sh_block_topk_weights[idx] = Dtype::num2num2( - Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]])); + if (idx < block_num_valid_tokens) { + if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) { + sh_block_topk_weights[idx] = + __hmul2(global_scale, + Dtype::num2num2(Dtype::float2num( + topk_weights_ptr[sh_block_sorted_ids[idx]]))); + } else { + sh_block_topk_weights[idx] = Dtype::num2num2( + Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]])); + } } } } diff --git a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py index 6ec8b33ed9309..9aaeec4f98a61 100644 --- a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py @@ -38,7 +38,6 @@ class SharedFusedMoE(FusedMoE): # TODO(wentao): find the root cause and remove this condition self.enable_eplb or (self.moe_config.use_flashinfer_cutlass_kernels and self.dp_size > 1) - or self.use_marlin_kernels ) and self._shared_experts is not None ) From 9bb33c8919024a50e48687c08d706df3fa3302ed Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Wed, 26 Nov 2025 19:30:50 -0800 Subject: [PATCH 016/197] add xpu supported model and model id for cpu (#29380) Signed-off-by: Tsai, Louie --- 
docs/models/hardware_supported_models/cpu.md | 26 +++++--- docs/models/hardware_supported_models/xpu.md | 65 ++++++++++++++++++++ 2 files changed, 82 insertions(+), 9 deletions(-) create mode 100644 docs/models/hardware_supported_models/xpu.md diff --git a/docs/models/hardware_supported_models/cpu.md b/docs/models/hardware_supported_models/cpu.md index 0832755f8fbe2..811778b2ad529 100644 --- a/docs/models/hardware_supported_models/cpu.md +++ b/docs/models/hardware_supported_models/cpu.md @@ -1,25 +1,33 @@ # CPU - Intel® Xeon® +## Validated Hardware + +| Hardware | +| ----------------------------------------- | +| [Intel® Xeon® 6 Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon.html) | +| [Intel® Xeon® 5 Processors](https://www.intel.com/content/www/us/en/products/docs/processors/xeon/5th-gen-xeon-scalable-processors.html) | + ## Supported Models ### Text-only Language Models | Model | Architecture | Supported | |--------------------------------------|-------------------------------------------|-----------| -| meta-llama/Llama-3.1 / 3.3 | LlamaForCausalLM | ✅ | -| meta-llama/Llama-4-Scout | Llama4ForConditionalGeneration | ✅ | -| meta-llama/Llama-4-Maverick | Llama4ForConditionalGeneration | ✅ | -| ibm-granite/granite (Granite-MOE) | GraniteMoeForCausalLM | ✅ | -| Qwen/Qwen3 | Qwen3ForCausalLM | ✅ | -| zai-org/GLM-4.5 | GLMForCausalLM | ✅ | -| google/gemma | GemmaForCausalLM | ✅ | +| meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM | ✅ | +| meta-llama/Llama-3.2-3B-Instruct | LlamaForCausalLM | ✅ | +| ibm-granite/granite-3.2-2b-instruct | GraniteForCausalLM | ✅ | +| Qwen/Qwen3-1.7B | Qwen3ForCausalLM | ✅ | +| Qwen/Qwen3-4B | Qwen3ForCausalLM | ✅ | +| Qwen/Qwen3-8B | Qwen3ForCausalLM | ✅ | +| zai-org/glm-4-9b-hf | GLMForCausalLM | ✅ | +| google/gemma-7b | GemmaForCausalLM | ✅ | ### Multimodal Language Models | Model | Architecture | Supported | 
|--------------------------------------|-------------------------------------------|-----------| -| Qwen/Qwen2.5-VL | Qwen2VLForConditionalGeneration | ✅ | -| openai/whisper | WhisperForConditionalGeneration | ✅ | +| Qwen/Qwen2.5-VL-7B-Instruct | Qwen2VLForConditionalGeneration | ✅ | +| openai/whisper-large-v3 | WhisperForConditionalGeneration | ✅ | ✅ Runs and optimized. 🟨 Runs and correct but not optimized to green yet. diff --git a/docs/models/hardware_supported_models/xpu.md b/docs/models/hardware_supported_models/xpu.md new file mode 100644 index 0000000000000..7b8dcf5c9af26 --- /dev/null +++ b/docs/models/hardware_supported_models/xpu.md @@ -0,0 +1,65 @@ +# XPU - Intel® GPUs + +## Validated Hardware + +| Hardware | +| ----------------------------------------- | +| [Intel® Arc™ Pro B-Series Graphics](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/workstations/b-series/overview.html) | + +## Supported Models + +### Text-only Language Models + +| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 | +| ----------------------------------------- | ---------------------------------------------------- | ---- | ----------- | ----- | +| openai/gpt-oss-20b | GPTForCausalLM | | | ✅ | +| openai/gpt-oss-120b | GPTForCausalLM | | | ✅ | +| deepseek-ai/DeepSeek-R1-Distill-Llama-8B | LlamaForCausalLM | ✅ | ✅ | | +| deepseek-ai/DeepSeek-R1-Distill-Qwen-14B | QwenForCausalLM | ✅ | ✅ | | +| deepseek-ai/DeepSeek-R1-Distill-Qwen-32B | QwenForCausalLM | ✅ | ✅ | | +| deepseek-ai/DeepSeek-R1-Distill-Llama-70B | LlamaForCausalLM | ✅ | ✅ | | +| Qwen/Qwen2.5-72B-Instruct | Qwen2ForCausalLM | ✅ | ✅ | | +| Qwen/Qwen3-14B | Qwen3ForCausalLM | ✅ | ✅ | | +| Qwen/Qwen3-32B | Qwen3ForCausalLM | ✅ | ✅ | | +| Qwen/Qwen3-30B-A3B | Qwen3ForCausalLM | ✅ | ✅ | | +| Qwen/Qwen3-30B-A3B-GPTQ-Int4 | Qwen3ForCausalLM | ✅ | ✅ | | +| Qwen/Qwen3-coder-30B-A3B-Instruct | Qwen3ForCausalLM | ✅ | ✅ | | +| Qwen/QwQ-32B | QwenForCausalLM | ✅ | ✅ | | +| deepseek-ai/DeepSeek-V2-Lite | 
DeepSeekForCausalLM | ✅ | ✅ | | +| meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM | ✅ | ✅ | | +| baichuan-inc/Baichuan2-13B-Chat | BaichuanForCausalLM | ✅ | ✅ | | +| THUDM/GLM-4-9B-chat | GLMForCausalLM | ✅ | ✅ | | +| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM | ✅ | ✅ | | +| chuhac/TeleChat2-35B | LlamaForCausalLM (TeleChat2 based on Llama arch) | ✅ | ✅ | | +| 01-ai/Yi1.5-34B-Chat | YiForCausalLM | ✅ | ✅ | | +| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM | ✅ | ✅ | | +| deepseek-ai/DeepSeek-Coder-33B-base | DeepSeekCoderForCausalLM | ✅ | ✅ | | +| baichuan-inc/Baichuan2-13B-Chat | BaichuanForCausalLM | ✅ | ✅ | | +| meta-llama/Llama-2-13b-chat-hf | LlamaForCausalLM | ✅ | ✅ | | +| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM | ✅ | ✅ | | +| Qwen/Qwen1.5-14B-Chat | QwenForCausalLM | ✅ | ✅ | | +| Qwen/Qwen1.5-32B-Chat | QwenForCausalLM | ✅ | ✅ | | + +### Multimodal Language Models + +| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 | +| ---------------------------- | -------------------------------- | ---- | ----------- | ----- | +| OpenGVLab/InternVL3_5-8B | InternVLForConditionalGeneration | ✅ | ✅ | | +| OpenGVLab/InternVL3_5-14B | InternVLForConditionalGeneration | ✅ | ✅ | | +| OpenGVLab/InternVL3_5-38B | InternVLForConditionalGeneration | ✅ | ✅ | | +| Qwen/Qwen2-VL-7B-Instruct | Qwen2VLForConditionalGeneration | ✅ | ✅ | | +| Qwen/Qwen2.5-VL-72B-Instruct | Qwen2VLForConditionalGeneration | ✅ | ✅ | | +| Qwen/Qwen2.5-VL-32B-Instruct | Qwen2VLForConditionalGeneration | ✅ | ✅ | | +| THUDM/GLM-4v-9B | GLM4vForConditionalGeneration | ✅ | ✅ | | +| openbmb/MiniCPM-V-4 | MiniCPMVForConditionalGeneration | ✅ | ✅ | | + +### Embedding and Reranker Language Models + +| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 | +| ----------------------- | ------------------------------ | ---- | ----------- | ----- | +| Qwen/Qwen3-Embedding-8B | Qwen3ForTextEmbedding | ✅ | ✅ | | +| Qwen/Qwen3-Reranker-8B | Qwen3ForSequenceClassification | ✅ | ✅ | | + +✅ Runs and optimized. 
+🟨 Runs and correct but not optimized to green yet. +❌ Does not pass accuracy test or does not run. From 0aeb698b774e2d8593b14988e3af9ebbdd773730 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 26 Nov 2025 19:47:17 -0800 Subject: [PATCH 017/197] [Model Runner V2] Minor code cleanup (#29570) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/cudagraph_utils.py | 11 ++--------- vllm/v1/worker/gpu/dp_utils.py | 9 +++++++++ vllm/v1/worker/gpu/model_runner.py | 16 +++++++--------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py index ba783e2d0c6fb..6b056641c903d 100644 --- a/vllm/v1/worker/gpu/cudagraph_utils.py +++ b/vllm/v1/worker/gpu/cudagraph_utils.py @@ -16,6 +16,7 @@ from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.worker.gpu.attn_utils import build_attn_metadata from vllm.v1.worker.gpu.block_table import BlockTables +from vllm.v1.worker.gpu.dp_utils import make_num_tokens_across_dp from vllm.v1.worker.gpu.input_batch import InputBuffers @@ -127,15 +128,7 @@ class CudaGraphManager: slot_mappings=slot_mappings, kv_cache_config=kv_cache_config, ) - if self.dp_size > 1: - num_tokens_across_dp = torch.full( - (self.dp_size,), - batch_size, - dtype=torch.int32, - device="cpu", - ) - else: - num_tokens_across_dp = None + num_tokens_across_dp = make_num_tokens_across_dp(self.dp_size, batch_size) # Warm up. 
with set_forward_context( diff --git a/vllm/v1/worker/gpu/dp_utils.py b/vllm/v1/worker/gpu/dp_utils.py index 9bfc7f25bef3a..d71d91d1e5cb8 100644 --- a/vllm/v1/worker/gpu/dp_utils.py +++ b/vllm/v1/worker/gpu/dp_utils.py @@ -20,3 +20,12 @@ def get_batch_metadata_across_dp( tensor[1][dp_rank] = cudagraph_size dist.all_reduce(tensor, group=group) return tensor[0], tensor[1] + + +def make_num_tokens_across_dp( + dp_size: int, + num_tokens: int, +) -> torch.Tensor | None: + if dp_size == 1: + return None + return torch.full((dp_size,), num_tokens, dtype=torch.int32, device="cpu") diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index e34a45f979807..6a78776b0a8a3 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -35,7 +35,10 @@ from vllm.v1.worker.gpu.attn_utils import ( ) from vllm.v1.worker.gpu.block_table import BlockTables from vllm.v1.worker.gpu.cudagraph_utils import CudaGraphManager -from vllm.v1.worker.gpu.dp_utils import get_batch_metadata_across_dp +from vllm.v1.worker.gpu.dp_utils import ( + get_batch_metadata_across_dp, + make_num_tokens_across_dp, +) from vllm.v1.worker.gpu.input_batch import ( InputBatch, InputBuffers, @@ -255,12 +258,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if not skip_attn: self.prepare_dummy_attn_metadata(input_batch) - if self.dp_size == 1: - num_tokens_across_dp: torch.Tensor | None = None - else: - num_tokens_across_dp = torch.full( - (self.dp_size,), num_tokens, dtype=torch.int32, device="cpu" - ) + num_tokens_across_dp = make_num_tokens_across_dp(self.dp_size, num_tokens) num_sampled_tokens = np.ones(input_batch.num_reqs, dtype=np.int32) with ( self.maybe_dummy_run_with_lora( @@ -816,7 +814,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.req_states.last_sampled_tokens, next_prefill_tokens, ) - self.req_states.draft_tokens[input_batch.idx_mapping] = draft_tokens return draft_tokens def 
get_cudagraph_and_dp_padding( @@ -1006,7 +1003,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): input_batch, sampler_output.sampled_token_ids, num_sampled, num_rejected ) if self.do_spec_decode: - _ = self.propose_draft( + draft_tokens = self.propose_draft( input_batch, sampling_metadata, hidden_states, @@ -1014,6 +1011,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_sampled, num_rejected, ) + self.req_states.draft_tokens[input_batch.idx_mapping] = draft_tokens if self.use_async_scheduling: return async_output From ee80aee1cab6f0b6893cf54c8aa2f2c23512ec82 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 26 Nov 2025 20:10:12 -0800 Subject: [PATCH 018/197] [Model Runner V2] Minor cleanup for build_attn_metadata (#29576) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/attn_utils.py | 8 +++----- vllm/v1/worker/gpu/cudagraph_utils.py | 3 ++- vllm/v1/worker/gpu/model_runner.py | 10 ++++++++-- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py index 4510a1c5ca1e9..5aa1a33d851cc 100644 --- a/vllm/v1/worker/gpu/attn_utils.py +++ b/vllm/v1/worker/gpu/attn_utils.py @@ -18,7 +18,6 @@ from vllm.v1.kv_cache_interface import ( KVCacheConfig, KVCacheSpec, ) -from vllm.v1.utils import CpuGpuBuffer from vllm.v1.worker.utils import bind_kv_cache @@ -145,7 +144,8 @@ def build_attn_metadata( attn_metadata_builders: list[AttentionMetadataBuilder], num_reqs: int, num_tokens: int, - query_start_loc: CpuGpuBuffer, + query_start_loc_gpu: torch.Tensor, + query_start_loc_cpu: torch.Tensor, seq_lens: torch.Tensor, seq_lens_np: np.ndarray, num_computed_tokens_cpu: torch.Tensor | None, @@ -153,9 +153,7 @@ def build_attn_metadata( slot_mappings: torch.Tensor, kv_cache_config: KVCacheConfig, ) -> dict[str, Any]: - query_start_loc_gpu = query_start_loc.gpu[: num_reqs + 1] - query_start_loc_cpu = query_start_loc.cpu[: num_reqs + 1] - max_query_len = 
int(query_start_loc.np[: num_reqs + 1].max()) + max_query_len = int(query_start_loc_cpu.max()) seq_lens = seq_lens[:num_reqs] seq_lens_cpu = torch.from_numpy(seq_lens_np) max_seq_len = int(seq_lens_np.max()) diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py index 6b056641c903d..b5fc2edea130f 100644 --- a/vllm/v1/worker/gpu/cudagraph_utils.py +++ b/vllm/v1/worker/gpu/cudagraph_utils.py @@ -120,7 +120,8 @@ class CudaGraphManager: attn_metadata_builders=attn_metadata_builders, num_reqs=batch_size, num_tokens=batch_size, - query_start_loc=input_buffers.query_start_loc, + query_start_loc_gpu=input_buffers.query_start_loc.gpu[: batch_size + 1], + query_start_loc_cpu=input_buffers.query_start_loc.cpu[: batch_size + 1], seq_lens=input_buffers.seq_lens, seq_lens_np=np.full(batch_size, self.max_model_len, dtype=np.int32), num_computed_tokens_cpu=None, # FIXME diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index 6a78776b0a8a3..ed41e5a1a6c5e 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -226,11 +226,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_computed_tokens = torch.zeros( input_batch.num_reqs, dtype=torch.int32, device=self.device ) + query_start_loc = self.input_buffers.query_start_loc + query_start_loc_gpu = query_start_loc.gpu[: input_batch.num_reqs + 1] + query_start_loc_cpu = query_start_loc.cpu[: input_batch.num_reqs + 1] attn_metadata = build_attn_metadata( attn_metadata_builders=self.attn_metadata_builders, num_reqs=input_batch.num_reqs, num_tokens=input_batch.num_tokens, - query_start_loc=self.input_buffers.query_start_loc, + query_start_loc_gpu=query_start_loc_gpu, + query_start_loc_cpu=query_start_loc_cpu, seq_lens=self.input_buffers.seq_lens, seq_lens_np=input_batch.seq_lens_np, num_computed_tokens_cpu=num_computed_tokens, @@ -515,6 +519,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, 
KVConnectorModelRunnerMixin): self.input_buffers.query_start_loc.np[num_reqs + 1 :] = num_tokens self.input_buffers.query_start_loc.copy_to_gpu() query_start_loc_gpu = self.input_buffers.query_start_loc.gpu[: num_reqs + 1] + query_start_loc_cpu = self.input_buffers.query_start_loc.cpu[: num_reqs + 1] query_start_loc_np = self.input_buffers.query_start_loc.np[: num_reqs + 1] # Copy prefill tokens from CPU to GPU. @@ -572,7 +577,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): attn_metadata_builders=self.attn_metadata_builders, num_reqs=num_reqs, num_tokens=num_tokens, - query_start_loc=self.input_buffers.query_start_loc, + query_start_loc_gpu=query_start_loc_gpu, + query_start_loc_cpu=query_start_loc_cpu, seq_lens=self.input_buffers.seq_lens, seq_lens_np=seq_lens_np, num_computed_tokens_cpu=num_computed_tokens, From da8e1a1bf9b0f6d3b18608d8f99a456cd8833a0f Mon Sep 17 00:00:00 2001 From: TJian Date: Thu, 27 Nov 2025 12:42:50 +0800 Subject: [PATCH 019/197] [DOC] Add vLLM Bangkok Meetup info (#29561) Signed-off-by: tjtanaa --- README.md | 1 + docs/community/meetups.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 033e1035d8916..abbb63158f166 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio *Latest News* 🔥 +- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing). - [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. 
Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI) - [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link). - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6). diff --git a/docs/community/meetups.md b/docs/community/meetups.md index 0735f452df960..d8cf4ecdd5a32 100644 --- a/docs/community/meetups.md +++ b/docs/community/meetups.md @@ -10,6 +10,7 @@ Stay tuned for upcoming meetups! Follow us on [Twitter/X](https://x.com/vllm_pro Below you'll find slides and recordings from our previous meetups: +- [vLLM Bangkok Meetup](https://luma.com/v0f647nv), November 21st 2025. [[Slides]](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing) - [vLLM Zurich Meetup](https://luma.com/0gls27kb), November 6th 2025. [[Slides]](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) [[Recording]](https://www.youtube.com/watch?v=6m6ZE6yVEDI) - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w), November 1st 2025. [[Slides]](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link) - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg), October 25th 2025. 
[[Slides]](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6) From ecb1952378dda5c7a3a8d89cc6c2f1d806248b9f Mon Sep 17 00:00:00 2001 From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Date: Thu, 27 Nov 2025 05:09:41 +0000 Subject: [PATCH 020/197] [cpu][fix] Fix Arm CI tests (#29552) Signed-off-by: Fadi Arafeh --- .../scripts/hardware_ci/run-cpu-test-arm.sh | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh index d0036f24c8d04..b5f6b2494792f 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh @@ -7,53 +7,51 @@ set -ex # allow to bind to different cores CORE_RANGE=${CORE_RANGE:-0-16} OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16} -NUMA_NODE=${NUMA_NODE:-0} -export CMAKE_BUILD_PARALLEL_LEVEL=32 +export CMAKE_BUILD_PARALLEL_LEVEL=16 # Setup cleanup remove_docker_container() { set -e; - docker rm -f cpu-test-"$NUMA_NODE" || true; + docker rm -f cpu-test || true; } trap remove_docker_container EXIT remove_docker_container # Try building the docker image -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . +docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu . -# Run the image, setting --shm-size=4g for tensor parallel. 
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" +# Run the image +docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test function cpu_tests() { set -e - export NUMA_NODE=$2 - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test bash -c " set -e pip list" # offline inference - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test bash -c " set -e python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" # Run kernel tests - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test bash -c " set -e pytest -x -v -s tests/kernels/test_onednn.py pytest -x -v -s tests/kernels/attention/test_cpu_attn.py" # basic online serving - docker exec cpu-test-"$NUMA_NODE" bash -c ' + docker exec cpu-test bash -c ' set -e - VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 & + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 & server_pid=$! timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 vllm bench serve \ --backend vllm \ --dataset-name random \ - --model meta-llama/Llama-3.2-3B-Instruct \ + --model Qwen/Qwen3-0.6B \ --num-prompts 20 \ --endpoint /v1/completions kill -s SIGTERM $server_pid &' @@ -61,4 +59,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 40 mins. 
export -f cpu_tests -timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 2h bash -c cpu_tests From 11ea5ec1ff7afc5ba181cba41f0cc2e4053e27f3 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 26 Nov 2025 21:37:59 -0800 Subject: [PATCH 021/197] [Model Runner V2] Refactor CudaGraphManager (#29583) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/cudagraph_utils.py | 247 ++++++++++++++++---------- 1 file changed, 153 insertions(+), 94 deletions(-) diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py index b5fc2edea130f..8f1718e493b1e 100644 --- a/vllm/v1/worker/gpu/cudagraph_utils.py +++ b/vllm/v1/worker/gpu/cudagraph_utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from unittest.mock import patch +from collections.abc import Callable, Iterable +from typing import Any import numpy as np import torch @@ -32,6 +33,7 @@ class CudaGraphManager: self.max_model_len = vllm_config.model_config.max_model_len self.max_num_reqs = self.scheduler_config.max_num_seqs + self.max_num_tokens = self.scheduler_config.max_num_batched_tokens self.dp_size = vllm_config.parallel_config.data_parallel_size self.compilation_config = vllm_config.compilation_config assert self.compilation_config is not None @@ -40,102 +42,60 @@ class CudaGraphManager: self.cudagraph_mode = CUDAGraphMode.NONE else: self.cudagraph_mode = self.compilation_config.cudagraph_mode - if self.compilation_config.cudagraph_capture_sizes is not None: - cudagraph_sizes = sorted(self.compilation_config.cudagraph_capture_sizes) - # Limit the cudagraph sizes to the max decode batch size. 
- self.cudagraph_sizes = [ - x for x in cudagraph_sizes if x <= self.max_num_reqs - ] - else: - self.cudagraph_sizes = [] - self.padded_sizes = self._init_padded_sizes() + self.cudagraph_sizes = get_cudagraph_sizes( + self.compilation_config.cudagraph_capture_sizes, + self.max_num_reqs, + self.max_num_tokens, + self.cudagraph_mode, + ) self.graphs: dict[int, torch.cuda.CUDAGraph] = {} self.pool = torch.cuda.graph_pool_handle() self.hidden_states: torch.Tensor | None = None - def _init_padded_sizes(self) -> dict[int, int]: - if not self.cudagraph_mode.has_full_cudagraphs(): - # Full cuda graphs are not used. - return {} - if not self.cudagraph_sizes: - return {} - - padded_sizes: dict[int, int] = {} - for i in range(1, self.cudagraph_sizes[-1] + 1): - for x in self.cudagraph_sizes: - if i <= x: - padded_sizes[i] = x - break - return padded_sizes - def needs_capture(self) -> bool: - return len(self.padded_sizes) > 0 + return len(self.cudagraph_sizes) > 0 def get_cudagraph_size( self, scheduler_output: SchedulerOutput, num_tokens_after_padding: int, ) -> int | None: - if not self.cudagraph_mode.has_full_cudagraphs(): - return None - if self.cudagraph_mode != CUDAGraphMode.FULL: - # TODO(woosuk): Support uniform decode with multiple tokens (spec decoding). - all_decode = all( - x == 1 for x in scheduler_output.num_scheduled_tokens.values() - ) - if not all_decode: - # Prefill is included. - return None - return self.padded_sizes.get(num_tokens_after_padding) + return get_cudagraph_size( + num_tokens_after_padding, + scheduler_output.num_scheduled_tokens.values(), + self.cudagraph_sizes, + self.cudagraph_mode, + ) def capture_graph( self, - batch_size: int, + num_tokens: int, model: nn.Module, input_buffers: InputBuffers, block_tables: BlockTables, attn_metadata_builders: list[AttentionMetadataBuilder], kv_cache_config: KVCacheConfig, ) -> None: - assert batch_size not in self.graphs - - # Prepare dummy inputs. 
- input_ids = input_buffers.input_ids.gpu[:batch_size] - positions = input_buffers.positions[:batch_size] - - input_buffers.query_start_loc.np[: batch_size + 1] = np.arange(batch_size + 1) - input_buffers.query_start_loc.np[batch_size:] = batch_size - input_buffers.query_start_loc.copy_to_gpu() - # HACK(woosuk): To optimize warmup time, we use 1 (instead of max_model_len) - # for seq_lens. This leads to a mismatch between seq_lens (GPU) and - # seq_lens_np (CPU), which might cause issues in some attention backends. - input_buffers.seq_lens[:batch_size] = 1 - input_buffers.seq_lens[batch_size:] = 0 - - input_block_tables = [x[:batch_size] for x in block_tables.input_block_tables] - slot_mappings = block_tables.slot_mappings[:, :batch_size] - - attn_metadata = build_attn_metadata( - attn_metadata_builders=attn_metadata_builders, - num_reqs=batch_size, - num_tokens=batch_size, - query_start_loc_gpu=input_buffers.query_start_loc.gpu[: batch_size + 1], - query_start_loc_cpu=input_buffers.query_start_loc.cpu[: batch_size + 1], - seq_lens=input_buffers.seq_lens, - seq_lens_np=np.full(batch_size, self.max_model_len, dtype=np.int32), - num_computed_tokens_cpu=None, # FIXME - block_tables=input_block_tables, - slot_mappings=slot_mappings, - kv_cache_config=kv_cache_config, + num_reqs = min(num_tokens, self.max_num_reqs) + input_ids = input_buffers.input_ids.gpu[:num_tokens] + positions = input_buffers.positions[:num_tokens] + attn_metadata = prepare_inputs_to_capture( + num_reqs, + num_tokens, + input_buffers, + block_tables, + attn_metadata_builders, + self.max_model_len, + kv_cache_config, ) - num_tokens_across_dp = make_num_tokens_across_dp(self.dp_size, batch_size) + num_tokens_across_dp = make_num_tokens_across_dp(self.dp_size, num_tokens) # Warm up. 
with set_forward_context( attn_metadata, self.vllm_config, - num_tokens=batch_size, + num_tokens=num_tokens, cudagraph_runtime_mode=CUDAGraphMode.NONE, num_tokens_across_dp=num_tokens_across_dp, ): @@ -147,13 +107,13 @@ class CudaGraphManager: self.hidden_states = torch.empty_like(hidden_states) # Capture the graph. + assert num_tokens not in self.graphs graph = torch.cuda.CUDAGraph() with ( - patch("torch.cuda.empty_cache", lambda: None), set_forward_context( attn_metadata, self.vllm_config, - num_tokens=batch_size, + num_tokens=num_tokens, cudagraph_runtime_mode=CUDAGraphMode.NONE, num_tokens_across_dp=num_tokens_across_dp, ), @@ -163,8 +123,8 @@ class CudaGraphManager: input_ids=input_ids, positions=positions, ) - self.hidden_states[:batch_size] = hidden_states - self.graphs[batch_size] = graph + self.hidden_states[:num_tokens] = hidden_states + self.graphs[num_tokens] = graph @torch.inference_mode() def capture( @@ -175,25 +135,124 @@ class CudaGraphManager: attn_metadata_builders: list[AttentionMetadataBuilder], kv_cache_config: KVCacheConfig, ) -> None: - assert self.needs_capture() - # Capture larger graphs first. 
- sizes_to_capture = sorted(self.cudagraph_sizes, reverse=True) - if is_global_first_rank(): - sizes_to_capture = tqdm(sizes_to_capture, desc="Capturing CUDA graphs") + capture_graphs( + self.cudagraph_sizes, + self.device, + self.capture_graph, + model=model, + input_buffers=input_buffers, + block_tables=block_tables, + attn_metadata_builders=attn_metadata_builders, + kv_cache_config=kv_cache_config, + ) - with graph_capture(device=self.device): - for batch_size in sizes_to_capture: - self.capture_graph( - batch_size, - model, - input_buffers, - block_tables, - attn_metadata_builders, - kv_cache_config, - ) - - def run(self, batch_size: int) -> torch.Tensor: - assert batch_size in self.graphs - self.graphs[batch_size].replay() + def run(self, num_tokens: int) -> torch.Tensor: + assert num_tokens in self.graphs + self.graphs[num_tokens].replay() assert self.hidden_states is not None - return self.hidden_states[:batch_size] + return self.hidden_states[:num_tokens] + + +def get_cudagraph_sizes( + capture_sizes: list[int] | None, + max_num_reqs: int, + max_num_tokens: int, + cudagraph_mode: CUDAGraphMode, +) -> dict[int, int]: + if not cudagraph_mode.has_full_cudagraphs(): + return {} + if not capture_sizes: + return {} + + capture_sizes = sorted(capture_sizes) + # Limit the capture sizes to the max number of requests or tokens. 
+ upper_bound = ( + max_num_reqs + if cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY + else max_num_tokens + ) + capture_sizes = [x for x in capture_sizes if x <= upper_bound] + if not capture_sizes: + return {} + + cudagraph_sizes: dict[int, int] = {} + for i in range(1, capture_sizes[-1] + 1): + for x in capture_sizes: + if i <= x: + cudagraph_sizes[i] = x + break + return cudagraph_sizes + + +def get_cudagraph_size( + num_tokens_after_dp_padding: int, + num_tokens_per_request: Iterable[int], + cudagraph_sizes: dict[int, int], + cudagraph_mode: CUDAGraphMode, +) -> int | None: + size = cudagraph_sizes.get(num_tokens_after_dp_padding) + if size is None: + # No CUDA graph for this size. + return None + if cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY: + all_decode = all(x == 1 for x in num_tokens_per_request) + if not all_decode: + # Prefill is included. + return None + return size + + +def capture_graphs( + cudagraph_sizes: dict[int, int], + device: torch.device, + capture_fn: Callable, + **capture_kwargs, +) -> None: + # Capture larger graphs first. 
+ sizes_to_capture = sorted(set(cudagraph_sizes.values()), reverse=True) + if is_global_first_rank(): + sizes_to_capture = tqdm(sizes_to_capture, desc="Capturing CUDA graphs") + + with graph_capture(device=device): + for size in sizes_to_capture: + capture_fn(size, **capture_kwargs) + + +def prepare_inputs_to_capture( + num_reqs: int, + num_tokens: int, + input_buffers: InputBuffers, + block_tables: BlockTables, + attn_metadata_builders: list[AttentionMetadataBuilder], + max_model_len: int, + kv_cache_config: KVCacheConfig, +) -> dict[str, Any]: + num_tokens_per_req = num_tokens // num_reqs + query_start_loc = input_buffers.query_start_loc + query_start_loc.np[: num_reqs + 1] = np.arange(num_reqs + 1) * num_tokens_per_req + query_start_loc.np[num_reqs:] = num_tokens + query_start_loc.copy_to_gpu() + seq_lens_np = np.full(num_reqs, max_model_len, dtype=np.int32) + # HACK(woosuk): To optimize warmup time, we use 1 (instead of max_model_len) + # for seq_lens. This leads to a mismatch between seq_lens (GPU) and + # seq_lens_np (CPU), which might cause issues in some attention backends. 
+ input_buffers.seq_lens[:num_reqs] = 1 + input_buffers.seq_lens[num_reqs:] = 0 + + input_block_tables = [x[:num_reqs] for x in block_tables.input_block_tables] + slot_mappings = block_tables.slot_mappings[:, :num_tokens] + + attn_metadata = build_attn_metadata( + attn_metadata_builders=attn_metadata_builders, + num_reqs=num_reqs, + num_tokens=num_tokens, + query_start_loc_gpu=query_start_loc.gpu[: num_reqs + 1], + query_start_loc_cpu=query_start_loc.cpu[: num_reqs + 1], + seq_lens=input_buffers.seq_lens, + seq_lens_np=seq_lens_np, + num_computed_tokens_cpu=None, # FIXME + block_tables=input_block_tables, + slot_mappings=slot_mappings, + kv_cache_config=kv_cache_config, + ) + return attn_metadata From c069086b9c9b8212b0a8544eb25b6af65c16762d Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 27 Nov 2025 15:16:07 +0800 Subject: [PATCH 022/197] [Bugfix] Fix getting device for MoE LoRA (#29475) Signed-off-by: Jee Jee Li --- vllm/lora/layers/fused_moe.py | 4 +++- vllm/lora/layers/utils.py | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 0eb6562bec6cd..1b925742c3002 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -30,6 +30,8 @@ from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import ( FusedMoEModularMethod, ) +from .utils import _get_lora_device + class FusedMoEWithLoRA(BaseLayerWithLoRA): def __init__(self, base_layer: FusedMoE) -> None: @@ -41,7 +43,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): ) self.tp_size = get_tensor_model_parallel_world_size() self.tp_rank = get_tensor_model_parallel_rank() - self.device = base_layer.w2_weight.device + self.device = _get_lora_device(base_layer) self._w13_slices = 2 self._inject_lora_into_fused_moe() diff --git a/vllm/lora/layers/utils.py b/vllm/lora/layers/utils.py index 2da90f180ee74..74403240f6cc2 100644 --- a/vllm/lora/layers/utils.py +++ b/vllm/lora/layers/utils.py @@ -33,6 
+33,15 @@ def _get_lora_device(base_layer: nn.Module) -> torch.device: # HQQ marlin elif hasattr(base_layer, "W_q"): return base_layer.W_q.device + # MoE layer + elif hasattr(base_layer, "w2_weight"): + return base_layer.w2_weight.device + # MoE Compressed Tensor + elif hasattr(base_layer, "w2_weight_packed"): + return base_layer.w2_weight_packed.device + # MoE GPTQ/AWQ/GGUF + elif hasattr(base_layer, "w2_qweight"): + return base_layer.w2_qweight.device else: raise ValueError(f"Unsupported base layer: {base_layer}") From 3ecabd06eee69e60c2239a6ca7159b978b26d6ce Mon Sep 17 00:00:00 2001 From: Johnny Yang <24908445+jcyang43@users.noreply.github.com> Date: Wed, 26 Nov 2025 23:25:21 -0800 Subject: [PATCH 023/197] Fix tpu-inference platform path (#29554) Signed-off-by: Johnny Yang --- vllm/platforms/tpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index aa5ddbe43659d..04325a522f444 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -267,7 +267,7 @@ class TpuPlatform(Platform): try: - from tpu_inference.platforms.tpu_platforms import ( + from tpu_inference.platforms import ( TpuPlatform as TpuInferencePlatform, ) From 43c5792592d9beb02eea57730ce5a4647dc0c838 Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Thu, 27 Nov 2025 01:54:44 -0600 Subject: [PATCH 024/197] [ROCm][CI] Fix test_cpu_offloading for ROCm (#29548) Signed-off-by: Micah Williamson --- tests/v1/kv_offload/test_cpu_offloading.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py index 406d4c0b4c1fd..57474a3dc01e7 100644 --- a/tests/v1/kv_offload/test_cpu_offloading.py +++ b/tests/v1/kv_offload/test_cpu_offloading.py @@ -20,6 +20,8 @@ ATTN_BACKENDS = ["FLASH_ATTN"] if current_platform.is_cuda(): ATTN_BACKENDS.append("FLASHINFER") +elif current_platform.is_rocm(): + ATTN_BACKENDS = ["TRITON_ATTN"] class MockSubscriber: From 
da3222f371b48c8e2548ec22767523394580a1c5 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 27 Nov 2025 00:09:41 -0800 Subject: [PATCH 025/197] [Model Runner V2] Implement multi-step Eagle with CUDA graph (#29559) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/cudagraph_utils.py | 9 +- vllm/v1/worker/gpu/model_runner.py | 53 +-- vllm/v1/worker/gpu/spec_decode/eagle.py | 422 ++++++++++++++++-- .../worker/gpu/spec_decode/eagle_cudagraph.py | 112 +++++ 4 files changed, 526 insertions(+), 70 deletions(-) create mode 100644 vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py index 8f1718e493b1e..4fd8eb50a4ea8 100644 --- a/vllm/v1/worker/gpu/cudagraph_utils.py +++ b/vllm/v1/worker/gpu/cudagraph_utils.py @@ -233,10 +233,11 @@ def prepare_inputs_to_capture( query_start_loc.np[num_reqs:] = num_tokens query_start_loc.copy_to_gpu() seq_lens_np = np.full(num_reqs, max_model_len, dtype=np.int32) - # HACK(woosuk): To optimize warmup time, we use 1 (instead of max_model_len) - # for seq_lens. This leads to a mismatch between seq_lens (GPU) and - # seq_lens_np (CPU), which might cause issues in some attention backends. - input_buffers.seq_lens[:num_reqs] = 1 + # HACK(woosuk): For faster warmup, we set seq_lens (GPU) to num_tokens + # rather than max_model_len. This introduces a discrepancy between + # seq_lens (on GPU) and seq_lens_np (on CPU), which may cause issues for + # certain attention backends. 
+ input_buffers.seq_lens[:num_reqs] = num_tokens input_buffers.seq_lens[num_reqs:] = 0 input_block_tables = [x[:num_reqs] for x in block_tables.input_block_tables] diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index ed41e5a1a6c5e..0c9fdd0077f4a 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -140,10 +140,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode) # CUDA graphs. - self.cudagraph_manager = CudaGraphManager( - vllm_config=self.vllm_config, - device=self.device, - ) + self.cudagraph_manager = CudaGraphManager(self.vllm_config, self.device) def get_supported_tasks(self) -> tuple[str]: return ("generate",) @@ -203,6 +200,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.vllm_config, self.device, ) + if self.do_spec_decode: + # HACK(woosuk) + self.speculator.set_attn( + self.kv_cache_config, + self.attn_metadata_builders, + self.block_tables, + ) + # TODO(woosuk): Support other backends. 
if not all(b.get_name() == "FLASH_ATTN" for b in self.attn_backends.values()): raise NotImplementedError("Only FLASH_ATTN backend is supported currently.") @@ -297,35 +302,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logits = self.model.compute_logits(hidden_states) self.sampler(logits, sampling_metadata) - @torch.inference_mode() - def _dummy_speculator_run( - self, - hidden_states: torch.Tensor, - aux_hidden_states: list[torch.Tensor] | None, - ) -> None: - num_tokens = hidden_states.shape[0] - num_reqs = min(num_tokens, self.max_num_reqs) - input_batch = InputBatch.make_dummy( - num_reqs=num_reqs, - num_tokens=num_tokens, - input_buffers=self.input_buffers, - device=self.device, - ) - sampling_metadata = SamplingMetadata.make_dummy( - num_reqs=num_reqs, - device=self.device, - ) - num_sampled = torch.ones(num_reqs, dtype=torch.int32, device=self.device) - num_rejected = torch.zeros(num_reqs, dtype=torch.int32, device=self.device) - self.propose_draft( - input_batch=input_batch, - sampling_metadata=sampling_metadata, - last_hidden_states=hidden_states, - aux_hidden_states=aux_hidden_states, - num_sampled=num_sampled, - num_rejected=num_rejected, - ) - @torch.inference_mode() def profile_run(self) -> None: hidden_states, sample_hidden_states = self._dummy_run( @@ -334,7 +310,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) self._dummy_sampler_run(sample_hidden_states) if self.do_spec_decode: - self._dummy_speculator_run(hidden_states, None) + num_tokens_across_dp = make_num_tokens_across_dp( + self.dp_size, self.max_num_tokens + ) + self.speculator.run_model( + self.max_num_tokens, + attn_metadata=None, + num_tokens_across_dp=num_tokens_across_dp, + ) torch.cuda.synchronize() del hidden_states, sample_hidden_states gc.collect() @@ -368,6 +351,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): attn_metadata_builders=self.attn_metadata_builders, 
kv_cache_config=self.kv_cache_config, ) + if self.do_spec_decode: + self.speculator.capture_model() end_time = time.perf_counter() end_free_gpu_memory = torch.cuda.mem_get_info()[0] diff --git a/vllm/v1/worker/gpu/spec_decode/eagle.py b/vllm/v1/worker/gpu/spec_decode/eagle.py index 3c8621cc69c97..daf2775e8b92d 100644 --- a/vllm/v1/worker/gpu/spec_decode/eagle.py +++ b/vllm/v1/worker/gpu/spec_decode/eagle.py @@ -1,17 +1,29 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + +import numpy as np import torch import torch.nn as nn from vllm.config import VllmConfig from vllm.config.compilation import CUDAGraphMode from vllm.forward_context import set_forward_context +from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.triton_utils import tl, triton -from vllm.v1.worker.gpu.input_batch import InputBatch +from vllm.utils.platform_utils import is_pin_memory_available +from vllm.v1.attention.backends.utils import AttentionMetadataBuilder +from vllm.v1.kv_cache_interface import KVCacheConfig +from vllm.v1.worker.gpu.attn_utils import build_attn_metadata +from vllm.v1.worker.gpu.block_table import BlockTables +from vllm.v1.worker.gpu.input_batch import InputBatch, InputBuffers from vllm.v1.worker.gpu.sampler import gumbel_sample +from vllm.v1.worker.gpu.spec_decode.eagle_cudagraph import EagleCudaGraphManager from vllm.v1.worker.gpu.states import SamplingMetadata +logger = init_logger(__name__) + class EagleSpeculator: def __init__(self, vllm_config: VllmConfig, device: torch.device): @@ -27,13 +39,48 @@ class EagleSpeculator: self.scheduler_config = vllm_config.scheduler_config self.max_num_reqs = self.scheduler_config.max_num_seqs self.max_num_tokens = self.scheduler_config.max_num_batched_tokens + self.max_model_len = vllm_config.model_config.max_model_len + # We need to get the hidden size from the draft model config because + # the draft 
model's hidden size can be different from the target model's + # hidden size (e.g., Llama 3.3 70B). + self.hidden_size = self.draft_model_config.get_hidden_size() + self.vocab_size = self.draft_model_config.get_vocab_size() + self.pin_memory = is_pin_memory_available() + self.dtype = vllm_config.model_config.dtype - self.input_ids = torch.zeros( - self.max_num_tokens, dtype=torch.int32, device=device + self.input_buffers = InputBuffers( + max_num_reqs=self.max_num_reqs, + max_num_tokens=self.max_num_tokens, + hidden_size=self.hidden_size, + vocab_size=self.vocab_size, + dtype=self.dtype, + device=device, + pin_memory=self.pin_memory, ) - self.positions = torch.zeros( - self.max_num_tokens, dtype=torch.int64, device=device + self.hidden_states = torch.zeros( + self.max_num_tokens, + self.hidden_size, + dtype=self.dtype, + device=device, ) + self.temperature = torch.zeros( + self.max_num_reqs, + dtype=torch.float32, + device=device, + ) + self.seeds = torch.zeros( + self.max_num_reqs, + dtype=torch.int64, + device=device, + ) + self.draft_tokens = torch.zeros( + self.max_num_reqs, + self.num_speculative_steps, + dtype=torch.int64, + device=device, + ) + + self.cudagraph_manager = EagleCudaGraphManager(vllm_config, device) def load_model(self, target_model: nn.Module) -> None: from vllm.compilation.backends import set_model_tag @@ -49,6 +96,91 @@ class EagleSpeculator: del self.model.lm_head self.model.lm_head = target_model.lm_head + def set_attn( + self, + kv_cache_config: KVCacheConfig, + attn_metadata_builders: list[AttentionMetadataBuilder], + block_tables: BlockTables, + ) -> None: + self.kv_cache_config = kv_cache_config + self.attn_metadata_builders = attn_metadata_builders + self.block_tables = block_tables + + @torch.inference_mode() + def run_model( + self, + num_tokens: int, + attn_metadata: dict[str, Any], + num_tokens_across_dp: torch.Tensor | None, + ) -> tuple[torch.Tensor, torch.Tensor]: + with set_forward_context( + attn_metadata, + self.vllm_config, 
+ num_tokens=num_tokens, + cudagraph_runtime_mode=CUDAGraphMode.NONE, + num_tokens_across_dp=num_tokens_across_dp, + ): + ret_hidden_states = self.model( + input_ids=self.input_buffers.input_ids.gpu[:num_tokens], + positions=self.input_buffers.positions[:num_tokens], + hidden_states=self.hidden_states[:num_tokens], + ) + if self.method == "mtp": + last_hidden_states = ret_hidden_states + hidden_states = ret_hidden_states + else: + last_hidden_states, hidden_states = ret_hidden_states + return last_hidden_states, hidden_states + + def generate_draft( + self, + num_reqs: int, + attn_metadata: dict[str, Any], + num_tokens_across_dp: torch.Tensor | None, + ) -> None: + pos = self.input_buffers.positions[:num_reqs] + query_start_loc = self.input_buffers.query_start_loc.gpu[: num_reqs + 1] + for step in range(1, self.num_speculative_steps): + # Run the eagle model. + last_hidden_states, hidden_states = self.run_model( + num_reqs, attn_metadata, num_tokens_across_dp + ) + logits = self.model.compute_logits(last_hidden_states) + + # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise + # used for draft and target sampling. + draft_tokens = gumbel_sample( + logits, + self.temperature[:num_reqs], + self.seeds[:num_reqs], + pos + 1, + apply_temperature=True, + ) + self.draft_tokens[:num_reqs, step] = draft_tokens + + if step < self.num_speculative_steps - 1: + # Update the inputs for the next step. 
+ update_eagle_inputs( + draft_tokens, + hidden_states, + self.input_buffers, + self.hidden_states, + self.max_model_len, + ) + self.block_tables.compute_slot_mappings(query_start_loc, pos) + + def capture_model(self) -> None: + if self.num_speculative_steps == 1: + return + logger.info("Capturing model for Eagle speculator...") + self.cudagraph_manager.capture( + self.generate_draft, + self.input_buffers, + self.block_tables, + self.attn_metadata_builders, + self.kv_cache_config, + ) + @torch.inference_mode() def propose( self, @@ -80,64 +212,110 @@ class EagleSpeculator: ) else: hidden_states = last_hidden_states + num_tokens = input_batch.num_tokens_after_padding + self.hidden_states[:num_tokens] = hidden_states # Get the input ids and last token indices for the speculator. last_token_indices = prepare_eagle_inputs( - self.input_ids, + self.input_buffers, input_batch, num_sampled, num_rejected, last_sampled, next_prefill_tokens, ) - input_ids = self.input_ids[: input_batch.num_tokens_after_padding] # Prefill: Run the eagle speculator with eager mode. - with set_forward_context( + # TODO(woosuk): Support CUDA graph for prefill. 
+ last_hidden_states, hidden_states = self.run_model( + num_tokens, input_batch.attn_metadata, - self.vllm_config, - num_tokens=input_batch.num_tokens_after_padding, - cudagraph_runtime_mode=CUDAGraphMode.NONE, - ): - ret_hidden_states = self.model( - input_ids=input_ids, - positions=input_batch.positions, - hidden_states=hidden_states, - ) - if self.method == "mtp": - last_hidden_states = ret_hidden_states - hidden_states = ret_hidden_states - else: - last_hidden_states, hidden_states = ret_hidden_states + num_tokens_across_dp=None, # FIXME + ) sample_hidden_states = last_hidden_states[last_token_indices] logits = self.model.compute_logits(sample_hidden_states) num_reqs = input_batch.num_reqs cu_num_logits = input_batch.cu_num_logits[:num_reqs] - temperature = sampling_metadata.temperature[cu_num_logits] - seed = sampling_metadata.seeds[cu_num_logits] - # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise - # used for draft and target sampling. - pos = input_batch.positions[last_token_indices] + 1 # NOTE(woosuk): For draft sampling, we only consider the temperature # and ignore the other sampling parameters such as top_k and top_p, # for simplicity and performance. # While this may slightly degrade the acceptance rate, it does not # affect the output distribution after rejection sampling. + temperature = self.temperature[:num_reqs] + seeds = self.seeds[:num_reqs] + pos = self.input_buffers.positions[:num_reqs] + # Gather the values and copy them to the pre-allocated buffers. + torch.gather(sampling_metadata.temperature, 0, cu_num_logits, out=temperature) + torch.gather(sampling_metadata.seeds, 0, cu_num_logits, out=seeds) + torch.gather(input_batch.positions, 0, last_token_indices, out=pos) + # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise + # used for draft and target sampling. 
draft_tokens = gumbel_sample( - logits, temperature, seed, pos, apply_temperature=True + logits, temperature, seeds, pos + 1, apply_temperature=True ) if self.num_speculative_steps == 1: # Early exit. return draft_tokens.view(-1, 1) - raise NotImplementedError("num_speculative_steps > 1 is not supported yet.") + + # Save the draft tokens for the first step. + self.draft_tokens[:num_reqs, 0] = draft_tokens + # Prepare the inputs for the decode steps. + prepare_eagle_decode( + draft_tokens, + hidden_states, + last_token_indices, + input_batch.seq_lens, + num_rejected, + self.input_buffers, + self.hidden_states, + self.max_model_len, + self.max_num_reqs, + ) + query_start_loc = self.input_buffers.query_start_loc + query_start_loc_gpu = query_start_loc.gpu[: num_reqs + 1] + slot_mappings = self.block_tables.compute_slot_mappings( + query_start_loc_gpu, pos + ) + + cudagraph_size = self.cudagraph_manager.get_cudagraph_size(num_reqs) + if cudagraph_size is not None: + # Run CUDA graph. + self.cudagraph_manager.run(cudagraph_size) + return self.draft_tokens[:num_reqs] + + # Run eager mode. + query_start_loc.np[: num_reqs + 1] = np.arange(num_reqs + 1) + query_start_loc_cpu = query_start_loc.cpu[: num_reqs + 1] + # HACK(woosuk) + seq_lens_np = np.full(num_reqs, self.max_model_len, dtype=np.int32) + block_tables = [x[:num_reqs] for x in self.block_tables.input_block_tables] + + # FIXME(woosuk): This is UNSAFE!! 
+ attn_metadata = build_attn_metadata( + attn_metadata_builders=self.attn_metadata_builders, + num_reqs=num_reqs, + num_tokens=num_reqs, + query_start_loc_gpu=query_start_loc_gpu, + query_start_loc_cpu=query_start_loc_cpu, + seq_lens=self.input_buffers.seq_lens[:num_reqs], + seq_lens_np=seq_lens_np, + num_computed_tokens_cpu=None, # FIXME + block_tables=block_tables, + slot_mappings=slot_mappings, + kv_cache_config=self.kv_cache_config, + ) + self.generate_draft(num_reqs, attn_metadata, num_tokens_across_dp=None) # FIXME + return self.draft_tokens[:num_reqs] @triton.jit def _prepare_eagle_inputs_kernel( last_token_indices_ptr, eagle_input_ids_ptr, + eagle_positions_ptr, target_input_ids_ptr, + target_positions_ptr, idx_mapping_ptr, last_sampled_ptr, next_prefill_tokens_ptr, @@ -175,9 +353,16 @@ def _prepare_eagle_inputs_kernel( tl.store(last_token_indices_ptr + batch_idx, last_token_index) tl.store(eagle_input_ids_ptr + last_token_index, next_token) + # Copy positions. + for i in range(0, query_len, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < query_len + target_pos = tl.load(target_positions_ptr + query_start + block, mask=mask) + tl.store(eagle_positions_ptr + query_start + block, target_pos, mask=mask) + def prepare_eagle_inputs( - eagle_input_ids: torch.Tensor, + input_buffers: InputBuffers, input_batch: InputBatch, # [num_reqs] num_sampled: torch.Tensor, @@ -192,12 +377,14 @@ def prepare_eagle_inputs( last_token_indices = torch.empty( num_reqs, dtype=torch.int64, - device=eagle_input_ids.device, + device=num_sampled.device, ) _prepare_eagle_inputs_kernel[(num_reqs,)]( last_token_indices, - eagle_input_ids, + input_buffers.input_ids.gpu, + input_buffers.positions, input_batch.input_ids, + input_batch.positions, input_batch.idx_mapping, last_sampled, next_prefill_tokens, @@ -207,3 +394,174 @@ def prepare_eagle_inputs( BLOCK_SIZE=1024, ) return last_token_indices + + +@triton.jit +def _prepare_eagle_docode_kernel( + draft_tokens_ptr, + 
output_hidden_states_ptr, + output_hidden_states_stride, + last_token_indices_ptr, + target_seq_lens_ptr, + num_rejected_ptr, + input_ids_ptr, + positions_ptr, + input_hidden_states_ptr, + input_hidden_states_stride, + query_start_loc_ptr, + seq_lens_ptr, + hidden_size, + max_model_len, + max_num_reqs, + BLOCK_SIZE: tl.constexpr, +): + req_idx = tl.program_id(0) + num_reqs = tl.num_programs(0) - 1 + if req_idx == num_reqs: + # Compute query_start_loc. Pad it with the last query_start_loc + # for CUDA graphs. + for i in range(0, max_num_reqs + 1, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + q = tl.where(block < num_reqs, block, num_reqs) + mask = block < max_num_reqs + 1 + tl.store(query_start_loc_ptr + block, q, mask=mask) + # Pad seq_lens for CUDA graphs. + for i in range(req_idx, max_num_reqs, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < max_num_reqs + tl.store(seq_lens_ptr + block, 0, mask=mask) + return + + # draft token -> input id. + draft_token = tl.load(draft_tokens_ptr + req_idx) + tl.store(input_ids_ptr + req_idx, draft_token) + + # output hidden states -> input hidden states. + src_idx = tl.load(last_token_indices_ptr + req_idx) + for i in range(0, hidden_size, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < hidden_size + output_hidden_states = tl.load( + output_hidden_states_ptr + src_idx * output_hidden_states_stride + block, + mask=mask, + ) + tl.store( + input_hidden_states_ptr + req_idx * input_hidden_states_stride + block, + output_hidden_states, + mask=mask, + ) + + # Compute position and seq_lens. + # NOTE(woosuk): To prevent out-of-range access, we clamp these values + # if they reach the max model length. 
+ position = tl.load(positions_ptr + req_idx) + position = tl.minimum(position + 1, max_model_len - 1) + tl.store(positions_ptr + req_idx, position) + + target_seq_len = tl.load(target_seq_lens_ptr + req_idx) + num_rejected = tl.load(num_rejected_ptr + req_idx) + seq_len = target_seq_len - num_rejected + seq_len = tl.minimum(seq_len + 1, max_model_len) + tl.store(seq_lens_ptr + req_idx, seq_len) + + +def prepare_eagle_decode( + draft_tokens: torch.Tensor, + output_hidden_states: torch.Tensor, + last_token_indices: torch.Tensor, + target_seq_lens: torch.Tensor, + num_rejected: torch.Tensor, + input_buffers: InputBuffers, + input_hidden_states: torch.Tensor, + max_model_len: int, + max_num_reqs: int, +): + num_reqs = draft_tokens.shape[0] + hidden_size = output_hidden_states.shape[-1] + _prepare_eagle_docode_kernel[(num_reqs + 1,)]( + draft_tokens, + output_hidden_states, + output_hidden_states.stride(0), + last_token_indices, + target_seq_lens, + num_rejected, + input_buffers.input_ids.gpu, + input_buffers.positions, + input_hidden_states, + input_hidden_states.stride(0), + input_buffers.query_start_loc.gpu, + input_buffers.seq_lens, + hidden_size, + max_model_len, + max_num_reqs, + BLOCK_SIZE=1024, + ) + + +@triton.jit +def _update_eagle_inputs_kernel( + input_ids_ptr, + positions_ptr, + input_hidden_states_ptr, + input_hidden_states_stride, + seq_lens_ptr, + max_model_len, + draft_tokens_ptr, + output_hidden_states_ptr, + output_hidden_states_stride, + hidden_size, + BLOCK_SIZE: tl.constexpr, +): + req_idx = tl.program_id(0) + + # Draft token -> Input ID. + draft_token = tl.load(draft_tokens_ptr + req_idx) + tl.store(input_ids_ptr + req_idx, draft_token) + + # Output hidden states -> Input hidden states. 
+ for i in range(0, hidden_size, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < hidden_size + output_hidden_states = tl.load( + output_hidden_states_ptr + req_idx * output_hidden_states_stride + block, + mask=mask, + ) + tl.store( + input_hidden_states_ptr + req_idx * input_hidden_states_stride + block, + output_hidden_states, + mask=mask, + ) + + # Increment position and seq_lens. + # NOTE(woosuk): To prevent out-of-range access, we clamp these values + # if they reach the max model length. + position = tl.load(positions_ptr + req_idx) + position = tl.minimum(position + 1, max_model_len - 1) + tl.store(positions_ptr + req_idx, position) + + seq_len = tl.load(seq_lens_ptr + req_idx) + seq_len = tl.minimum(seq_len + 1, max_model_len) + tl.store(seq_lens_ptr + req_idx, seq_len) + + +def update_eagle_inputs( + draft_tokens: torch.Tensor, + output_hidden_states: torch.Tensor, + input_buffers: InputBuffers, + hidden_states: torch.Tensor, + max_model_len: int, +): + num_reqs, hidden_size = output_hidden_states.shape + _update_eagle_inputs_kernel[(num_reqs,)]( + input_buffers.input_ids.gpu, + input_buffers.positions, + hidden_states, + hidden_states.stride(0), + input_buffers.seq_lens, + max_model_len, + draft_tokens, + output_hidden_states, + output_hidden_states.stride(0), + hidden_size, + BLOCK_SIZE=1024, + ) diff --git a/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py b/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py new file mode 100644 index 0000000000000..a6f50d68cc684 --- /dev/null +++ b/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + +import torch + +from vllm.config import VllmConfig +from vllm.config.compilation import CUDAGraphMode +from vllm.v1.attention.backends.utils import AttentionMetadataBuilder +from vllm.v1.kv_cache_interface import KVCacheConfig +from 
vllm.v1.worker.gpu.block_table import BlockTables +from vllm.v1.worker.gpu.cudagraph_utils import ( + capture_graphs, + get_cudagraph_sizes, + prepare_inputs_to_capture, +) +from vllm.v1.worker.gpu.dp_utils import make_num_tokens_across_dp +from vllm.v1.worker.gpu.input_batch import InputBuffers + + +class EagleCudaGraphManager: + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + ): + self.vllm_config = vllm_config + self.scheduler_config = vllm_config.scheduler_config + self.device = device + + self.max_model_len = vllm_config.model_config.max_model_len + self.max_num_reqs = self.scheduler_config.max_num_seqs + self.max_num_tokens = self.scheduler_config.max_num_batched_tokens + self.dp_size = vllm_config.parallel_config.data_parallel_size + self.compilation_config = vllm_config.compilation_config + assert self.compilation_config is not None + + if self.compilation_config.cudagraph_mode is None: + self.cudagraph_mode = CUDAGraphMode.NONE + else: + self.cudagraph_mode = self.compilation_config.cudagraph_mode + if self.cudagraph_mode == CUDAGraphMode.FULL: + # NOTE(woosuk): For Eagle, we only use CUDA graphs for decode. 
+ self.cudagraph_mode = CUDAGraphMode.FULL_DECODE_ONLY + + self.cudagraph_sizes = get_cudagraph_sizes( + self.compilation_config.cudagraph_capture_sizes, + self.max_num_reqs, + self.max_num_tokens, + self.cudagraph_mode, + ) + + self.graphs: dict[int, torch.cuda.CUDAGraph] = {} + self.pool = torch.cuda.graph_pool_handle() + + def get_cudagraph_size(self, num_tokens: int) -> int | None: + return self.cudagraph_sizes.get(num_tokens) + + def capture_graph( + self, + num_tokens: int, + generate_fn: Callable, + input_buffers: InputBuffers, + block_tables: BlockTables, + attn_metadata_builders: list[AttentionMetadataBuilder], + kv_cache_config: KVCacheConfig, + ) -> None: + num_reqs = min(num_tokens, self.max_num_reqs) + attn_metadata = prepare_inputs_to_capture( + num_reqs, + num_tokens, + input_buffers, + block_tables, + attn_metadata_builders, + self.max_model_len, + kv_cache_config, + ) + num_tokens_across_dp = make_num_tokens_across_dp(self.dp_size, num_tokens) + + # Warm up. + generate_fn(num_tokens, attn_metadata, num_tokens_across_dp) + + # Capture the graph. 
+ assert num_tokens not in self.graphs + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, self.pool): + generate_fn(num_tokens, attn_metadata, num_tokens_across_dp) + self.graphs[num_tokens] = graph + + @torch.inference_mode() + def capture( + self, + generate_fn: Callable, + input_buffers: InputBuffers, + block_tables: BlockTables, + attn_metadata_builders: list[AttentionMetadataBuilder], + kv_cache_config: KVCacheConfig, + ) -> None: + capture_graphs( + self.cudagraph_sizes, + self.device, + self.capture_graph, + generate_fn=generate_fn, + input_buffers=input_buffers, + block_tables=block_tables, + attn_metadata_builders=attn_metadata_builders, + kv_cache_config=kv_cache_config, + ) + + def run(self, num_tokens: int) -> None: + assert num_tokens in self.graphs + self.graphs[num_tokens].replay() From 00d3310d2d00d021d2e8f5f00e31b51d30f0413e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 27 Nov 2025 17:36:18 +0800 Subject: [PATCH 026/197] [Bugfix] Update Ultravox compatibility (#29588) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/ultravox.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index bb0f6bd036f14..26a8355cd22b5 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -116,7 +116,12 @@ class UltravoxProcessingInfo(BaseProcessingInfo): def get_feature_extractor(self, **kwargs: object) -> WhisperFeatureExtractor: hf_processor = self.get_hf_processor(**kwargs) + + # Changed in https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/commit/9a3c571b8fdaf1e66dd3ea61bbcb6db5c70a438e audio_processor = hf_processor.audio_processor # type: ignore + if isinstance(audio_processor, WhisperFeatureExtractor): + return audio_processor + feature_extractor = audio_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor From 
0838b52e2eff77d1aaf4ee9d0da19522b9a5749c Mon Sep 17 00:00:00 2001
From: Morrison Turnansky
Date: Thu, 27 Nov 2025 04:55:58 -0500
Subject: [PATCH 027/197] [Frontend][torch.compile] CompilationConfig Overhaul
 (#20283): Set up -O infrastructure (#26847)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: morrison-turnansky
Signed-off-by: adabeyta
Signed-off-by: Morrison Turnansky
Co-authored-by: adabeyta
Co-authored-by: Luka Govedič
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/design/optimization_levels.md | 69 ++++
 tests/compile/test_config.py | 4 +-
 tests/engine/test_arg_utils.py | 57 +++-
 .../model_executor/test_enabled_custom_ops.py | 8 +-
 tests/test_config.py | 307 +++++++++++++++++-
 tests/utils_/test_argparse_utils.py | 10 +-
 tests/v1/cudagraph/test_cudagraph_mode.py | 4 +-
 vllm/config/compilation.py | 75 ++++-
 vllm/config/model.py | 8 +
 vllm/config/vllm.py | 223 ++++++++++++-
 vllm/engine/arg_utils.py | 8 +-
 vllm/utils/argparse_utils.py | 24 +-
 vllm/v1/worker/gpu/cudagraph_utils.py | 2 +-
 13 files changed, 735 insertions(+), 64 deletions(-)
 create mode 100644 docs/design/optimization_levels.md

diff --git a/docs/design/optimization_levels.md b/docs/design/optimization_levels.md
new file mode 100644
index 0000000000000..940286071ef3c
--- /dev/null
+++ b/docs/design/optimization_levels.md
@@ -0,0 +1,69 @@
+
+
+# Optimization Levels
+
+## Overview
+
+vLLM now supports optimization levels (`-O0`, `-O1`, `-O2`, `-O3`). Optimization levels provide an intuitive mechanism for users to trade startup time for performance. Higher levels have better performance but worse startup time. These optimization levels have associated defaults to help users get desired out of the box performance. Importantly, defaults set by optimization levels are purely defaults; explicit user settings will not be overwritten.
+
+## Level Summaries and Usage Examples
+```bash
+# CLI usage
+python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O0
+
+# Python API usage
+from vllm.entrypoints.llm import LLM
+
+llm = LLM(
+    model="RedHatAI/Llama-3.2-1B-FP8",
+    optimization_level=0
+)
+```
+
+#### `-O1`: Quick Optimizations
+- **Startup**: Moderate startup time
+- **Performance**: Inductor compilation, CUDAGraphMode.PIECEWISE
+- **Use case**: Balance for most development scenarios
+
+```bash
+# CLI usage
+python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O1
+
+# Python API usage
+from vllm.entrypoints.llm import LLM
+
+llm = LLM(
+    model="RedHatAI/Llama-3.2-1B-FP8",
+    optimization_level=1
+)
+```
+
+#### `-O2`: Full Optimizations (Default)
+- **Startup**: Longer startup time
+- **Performance**: `-O1` + CUDAGraphMode.FULL_AND_PIECEWISE
+- **Use case**: Production workloads where performance is important. This is the default use case. It is also very similar to the previous default. The primary difference is that noop & fusion flags are enabled.
+
+```bash
+# CLI usage (default, so optional)
+python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O2
+
+# Python API usage
+from vllm.entrypoints.llm import LLM
+
+llm = LLM(
+    model="RedHatAI/Llama-3.2-1B-FP8",
+    optimization_level=2 # This is the default
+)
+```
+
+#### `-O3`: Full Optimization
+Still in development. Added infrastructure to prevent changing API in future
+release. Currently behaves the same as O2.
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Startup Time Too Long**: Use `-O0` or `-O1` for faster startup
+2. **Compilation Errors**: Use `debug_dump_path` for additional debugging information
+3. 
**Performance Issues**: Ensure using `-O2` for production \ No newline at end of file diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index 1e8a882a7f3eb..a9e5ccee520e3 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -172,8 +172,8 @@ def test_splitting_ops_dynamic(): config = VllmConfig() # Default V1 config leaves cudagraph mode unset; splitting ops are only # populated when the engine decides to use piecewise compilation. - assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE - assert not config.compilation_config.splitting_ops_contain_attention() + assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE + assert config.compilation_config.splitting_ops_contain_attention() # When use_inductor_graph_partition=True config = VllmConfig( diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index be926764e4948..0077609b2f365 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -222,6 +222,47 @@ def test_media_io_kwargs_parser(arg, expected): assert args.media_io_kwargs == expected +@pytest.mark.parametrize( + ("args", "expected"), + [ + (["-O", "1"], "1"), + (["-O", "2"], "2"), + (["-O", "3"], "3"), + (["-O0"], "0"), + (["-O1"], "1"), + (["-O2"], "2"), + (["-O3"], "3"), + ], +) +def test_optimization_level(args, expected): + """ + Test space-separated optimization levels (-O 1, -O 2, -O 3) map to + optimization_level. + """ + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + parsed_args = parser.parse_args(args) + assert parsed_args.optimization_level == expected + assert parsed_args.compilation_config.mode is None + + +@pytest.mark.parametrize( + ("args", "expected"), + [ + (["-O.mode=0"], 0), + (["-O.mode=1"], 1), + (["-O.mode=2"], 2), + (["-O.mode=3"], 3), + ], +) +def test_mode_parser(args, expected): + """ + Test compilation config modes (-O.mode=int) map to compilation_config. 
+ """ + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + parsed_args = parser.parse_args(args) + assert parsed_args.compilation_config.mode == expected + + def test_compilation_config(): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) @@ -229,22 +270,6 @@ def test_compilation_config(): args = parser.parse_args([]) assert args.compilation_config == CompilationConfig() - # set to O3 - args = parser.parse_args(["-O0"]) - assert args.compilation_config.mode == 0 - - # set to O 3 (space) - args = parser.parse_args(["-O", "1"]) - assert args.compilation_config.mode == 1 - - # set to O 3 (equals) - args = parser.parse_args(["-O=2"]) - assert args.compilation_config.mode == 2 - - # set to O.mode 3 - args = parser.parse_args(["-O.mode", "3"]) - assert args.compilation_config.mode == 3 - # set to string form of a dict args = parser.parse_args( [ diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 9121284de85b7..7d95dcddca711 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -5,7 +5,12 @@ import pytest import torch from vllm._aiter_ops import rocm_aiter_ops -from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config +from vllm.config import ( + CompilationConfig, + VllmConfig, + get_cached_compilation_config, + set_current_vllm_config, +) from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import ( GeluAndMul, @@ -86,6 +91,7 @@ def test_enabled_ops( backend=backend, mode=compilation_mode, custom_ops=custom_ops ) ) + get_cached_compilation_config.cache_clear() with set_current_vllm_config(vllm_config): assert CustomOp.default_on() == default_on diff --git a/tests/test_config.py b/tests/test_config.py index 16f68d18fc68b..080e4d2afacc6 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -8,9 +8,20 @@ from unittest.mock import patch import pytest from 
vllm.compilation.backends import VllmBackend -from vllm.config import ModelConfig, PoolerConfig, VllmConfig, update_config +from vllm.config import ( + CompilationConfig, + ModelConfig, + PoolerConfig, + VllmConfig, + update_config, +) +from vllm.config.compilation import CompilationMode, CUDAGraphMode from vllm.config.load import LoadConfig from vllm.config.utils import get_field +from vllm.config.vllm import ( + OPTIMIZATION_LEVEL_TO_CONFIG, + OptimizationLevel, +) from vllm.model_executor.layers.pooler import PoolingType from vllm.platforms import current_platform @@ -235,6 +246,43 @@ def test_default_pooling_type(model_id, default_pooling_type, pooling_type): assert model_config.pooler_config.pooling_type == pooling_type +@pytest.mark.parametrize( + ("model_id", "expected_is_moe_model"), + [ + ("RedHatAI/Qwen3-8B-speculator.eagle3", False), + ("RedHatAI/Llama-3.1-8B-Instruct-NVFP4", False), + ("RedHatAI/Llama-3.2-1B-FP8", False), + ("RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8", False), + ("RedHatAI/gpt-oss-20b", True), + ("RedHatAI/DeepSeek-V2.5-1210-FP8", True), + ("RedHatAI/Llama-4-Scout-17B-16E-Instruct", True), + ("RedHatAI/Mixtral-8x7B-Instruct-v0.1", True), + ], +) +def test_moe_model_detection(model_id, expected_is_moe_model): + model_config = ModelConfig(model_id) + # Just check that is_moe_model field exists and is a boolean + assert model_config.is_model_moe() == expected_is_moe_model + + +@pytest.mark.parametrize( + ("model_id", "quantized"), + [ + ("RedHatAI/Qwen3-8B-speculator.eagle3", False), + ("RedHatAI/Llama-3.1-8B-Instruct-NVFP4", True), + ("RedHatAI/Llama-3.2-1B-FP8", True), + ("RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8", True), + ("RedHatAI/gpt-oss-20b", True), + ("RedHatAI/DeepSeek-V2.5-1210-FP8", True), + ("RedHatAI/Mixtral-8x7B-Instruct-v0.1", False), + ], +) +def test_is_quantized(model_id, quantized): + model_config = ModelConfig(model_id) + # Just check that quantized field exists and is a boolean + assert 
model_config.is_quantized() == quantized + + @pytest.mark.skipif( current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." ) @@ -552,3 +600,260 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files): assert os.path.exists(config1.tokenizer) and os.path.isdir(config1.tokenizer) assert os.path.exists(config2.model) and os.path.isdir(config2.model) assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer) + + +@pytest.mark.parametrize( + ("backend", "custom_ops", "expected"), + [ + ("eager", [], True), + ("eager", ["+fused_layernorm"], True), + ("eager", ["all", "-fused_layernorm"], False), + ("inductor", [], False), + ("inductor", ["none", "+fused_layernorm"], True), + ("inductor", ["none", "-fused_layernorm"], False), + ], +) +def test_is_custom_op_enabled(backend: str, custom_ops: list[str], expected: bool): + """Test that is_custom_op_enabled works correctly.""" + config = VllmConfig( + compilation_config=CompilationConfig(backend=backend, custom_ops=custom_ops) + ) + assert config.compilation_config.is_custom_op_enabled("fused_layernorm") is expected + + +def test_vllm_config_defaults_are_none(): + """Verify that optimization-level defaults are None when not set by user.""" + # Test all optimization levels to ensure defaults work correctly + for opt_level in OptimizationLevel: + config = object.__new__(VllmConfig) + config.compilation_config = CompilationConfig() + config.optimization_level = opt_level + config.model_config = None + + # Use the global optimization level defaults + default_config = OPTIMIZATION_LEVEL_TO_CONFIG[opt_level] + + # Verify that all pass_config values are None before defaults are applied + for pass_k in default_config["compilation_config"]["pass_config"]: + assert getattr(config.compilation_config.pass_config, pass_k) is None + + # Verify that other config values are None before defaults are applied + for k in default_config["compilation_config"]: + if k != 
"pass_config": + assert getattr(config.compilation_config, k) is None + + +@pytest.mark.parametrize( + ("model_id", "compiliation_config", "optimization_level"), + [ + ( + None, + CompilationConfig(backend="eager", custom_ops=["+quant_fp8"]), + OptimizationLevel.O0, + ), + (None, CompilationConfig(), OptimizationLevel.O0), + (None, CompilationConfig(), OptimizationLevel.O1), + (None, CompilationConfig(), OptimizationLevel.O2), + (None, CompilationConfig(), OptimizationLevel.O3), + ( + "RedHatAI/Qwen3-8B-speculator.eagle3", + CompilationConfig(backend="inductor", custom_ops=["+quant_fp8"]), + OptimizationLevel.O2, + ), + ( + "RedHatAI/Qwen3-8B-speculator.eagle3", + CompilationConfig(), + OptimizationLevel.O0, + ), + ( + "RedHatAI/Qwen3-8B-speculator.eagle3", + CompilationConfig(), + OptimizationLevel.O1, + ), + ( + "RedHatAI/Qwen3-8B-speculator.eagle3", + CompilationConfig(), + OptimizationLevel.O2, + ), + ( + "RedHatAI/Qwen3-8B-speculator.eagle3", + CompilationConfig(), + OptimizationLevel.O3, + ), + ("RedHatAI/DeepSeek-V2.5-1210-FP8", CompilationConfig(), OptimizationLevel.O0), + ("RedHatAI/DeepSeek-V2.5-1210-FP8", CompilationConfig(), OptimizationLevel.O1), + ("RedHatAI/DeepSeek-V2.5-1210-FP8", CompilationConfig(), OptimizationLevel.O2), + ("RedHatAI/DeepSeek-V2.5-1210-FP8", CompilationConfig(), OptimizationLevel.O3), + ], +) +def test_vllm_config_defaults(model_id, compiliation_config, optimization_level): + """Test that optimization-level defaults are correctly applied.""" + + model_config = None + if model_id is not None: + model_config = ModelConfig(model_id) + vllm_config = VllmConfig( + model_config=model_config, + compilation_config=compiliation_config, + optimization_level=optimization_level, + ) + else: + vllm_config = VllmConfig( + compilation_config=compiliation_config, + optimization_level=optimization_level, + ) + # Use the global optimization level defaults + default_config = OPTIMIZATION_LEVEL_TO_CONFIG[optimization_level] + + # Verify pass_config 
defaults (nested under compilation_config) + pass_config_dict = default_config["compilation_config"]["pass_config"] + for pass_k, pass_v in pass_config_dict.items(): + actual = getattr(vllm_config.compilation_config.pass_config, pass_k) + expected = pass_v(vllm_config) if callable(pass_v) else pass_v + assert actual == expected, ( + f"pass_config.{pass_k}: expected {expected}, got {actual}" + ) + + # Verify other compilation_config defaults + compilation_config_dict = default_config["compilation_config"] + for k, v in compilation_config_dict.items(): + if k != "pass_config": + actual = getattr(vllm_config.compilation_config, k) + expected = v(vllm_config) if callable(v) else v + assert actual == expected, ( + f"compilation_config.{k}: expected {expected}, got {actual}" + ) + + +def test_vllm_config_callable_defaults(): + """Test that callable defaults work in the config system. + + Verifies that lambdas in default configs can inspect VllmConfig properties + (e.g., is_quantized, is_model_moe) to conditionally set optimization flags. 
+ """ + config_no_model = VllmConfig(optimization_level=OptimizationLevel.O2) + + # Callable that checks if model exists + has_model = lambda cfg: cfg.model_config is not None + assert has_model(config_no_model) is False + + # Test with quantized model + quantized_model = ModelConfig("RedHatAI/Llama-3.2-1B-FP8") + config_quantized = VllmConfig( + model_config=quantized_model, optimization_level=OptimizationLevel.O2 + ) + enable_if_quantized = lambda cfg: ( + cfg.model_config is not None and cfg.model_config.is_quantized() + ) + assert enable_if_quantized(config_quantized) is True + assert enable_if_quantized(config_no_model) is False + + # Test with MoE model + moe_model = ModelConfig("deepseek-ai/DeepSeek-V2-Lite") + config_moe = VllmConfig( + model_config=moe_model, optimization_level=OptimizationLevel.O2 + ) + enable_if_sequential = lambda cfg: ( + cfg.model_config is not None and not cfg.model_config.is_model_moe() + ) + assert enable_if_sequential(config_moe) is False + assert enable_if_sequential(config_quantized) is True + + +def test_vllm_config_explicit_overrides(): + """Test that explicit property overrides work correctly with callable defaults. + + When users explicitly set configuration properties, those values + take precedence over callable defaults, across different models and + optimization levels. 
+ """ + from vllm.config.compilation import PassConfig + + quantized_model = ModelConfig("RedHatAI/Llama-3.2-1B-FP8") + moe_model = ModelConfig("deepseek-ai/DeepSeek-V2-Lite") + regular_model = ModelConfig("Qwen/Qwen1.5-7B") + + # Explicit compilation mode override on O0 (where default is NONE) + compilation_config = CompilationConfig(mode=CompilationMode.VLLM_COMPILE) + config = VllmConfig( + optimization_level=OptimizationLevel.O0, + compilation_config=compilation_config, + ) + assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE + assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE + + # Explicit pass config flags to override defaults + pass_config = PassConfig(enable_noop=True, enable_attn_fusion=True) + compilation_config = CompilationConfig(pass_config=pass_config) + config = VllmConfig( + optimization_level=OptimizationLevel.O0, + compilation_config=compilation_config, + ) + assert config.compilation_config.pass_config.enable_noop is True + assert config.compilation_config.pass_config.enable_attn_fusion is True + + # Explicit cudagraph mode override on quantized model at O2 + pass_config = PassConfig(enable_async_tp=True) + compilation_config = CompilationConfig( + cudagraph_mode=CUDAGraphMode.NONE, pass_config=pass_config + ) + config = VllmConfig( + model_config=quantized_model, + optimization_level=OptimizationLevel.O2, + compilation_config=compilation_config, + ) + assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE + assert config.compilation_config.pass_config.enable_async_tp is True + # Mode should still use default for O2 + assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE + + # Different optimization levels with same model + config_o0 = VllmConfig( + model_config=regular_model, optimization_level=OptimizationLevel.O0 + ) + config_o2 = VllmConfig( + model_config=regular_model, optimization_level=OptimizationLevel.O2 + ) + assert config_o0.compilation_config.mode == 
CompilationMode.NONE + assert config_o2.compilation_config.mode == CompilationMode.VLLM_COMPILE + assert config_o0.compilation_config.cudagraph_mode == CUDAGraphMode.NONE + assert ( + config_o2.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE + ) + + # Same optimization level across different model types + config_moe_o2 = VllmConfig( + model_config=moe_model, optimization_level=OptimizationLevel.O2 + ) + config_regular_o2 = VllmConfig( + model_config=regular_model, optimization_level=OptimizationLevel.O2 + ) + config_quantized_o2 = VllmConfig( + model_config=quantized_model, optimization_level=OptimizationLevel.O2 + ) + # All should have same base compilation settings at O2 + assert config_moe_o2.compilation_config.mode == CompilationMode.VLLM_COMPILE + assert config_regular_o2.compilation_config.mode == CompilationMode.VLLM_COMPILE + assert config_quantized_o2.compilation_config.mode == CompilationMode.VLLM_COMPILE + assert ( + config_moe_o2.compilation_config.cudagraph_mode + == CUDAGraphMode.FULL_AND_PIECEWISE + ) + assert ( + config_regular_o2.compilation_config.cudagraph_mode + == CUDAGraphMode.FULL_AND_PIECEWISE + ) + + # Override one field but not others + pass_config = PassConfig(enable_noop=False) + compilation_config = CompilationConfig(pass_config=pass_config) + config = VllmConfig( + model_config=regular_model, + optimization_level=OptimizationLevel.O2, + compilation_config=compilation_config, + ) + # Explicit override should be respected + assert config.compilation_config.pass_config.enable_noop is False + # Other fields should still use defaults + assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE + assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE diff --git a/tests/utils_/test_argparse_utils.py b/tests/utils_/test_argparse_utils.py index 32d4eca541356..c0519155c4ba8 100644 --- a/tests/utils_/test_argparse_utils.py +++ b/tests/utils_/test_argparse_utils.py @@ -28,6 +28,7 @@ def 
parser(): parser.add_argument("--enable-feature", action="store_true") parser.add_argument("--hf-overrides", type=json.loads) parser.add_argument("-O", "--compilation-config", type=json.loads) + parser.add_argument("--optimization-level", type=int) return parser @@ -217,8 +218,8 @@ def test_dict_args(parser): "key15": "-minus.and.dot", }, } + assert parsed_args.optimization_level == 1 assert parsed_args.compilation_config == { - "mode": 1, "use_inductor_graph_partition": True, "backend": "custom", "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"], @@ -241,12 +242,13 @@ def test_duplicate_dict_args(caplog_vllm, parser): parsed_args = parser.parse_args(args) # Should be the last value assert parsed_args.hf_overrides == {"key1": "val2"} - assert parsed_args.compilation_config == {"mode": 3} + assert parsed_args.optimization_level == 3 + assert parsed_args.compilation_config == {"mode": 2} assert len(caplog_vllm.records) == 1 assert "duplicate" in caplog_vllm.text assert "--hf-overrides.key1" in caplog_vllm.text - assert "-O.mode" in caplog_vllm.text + assert "--optimization-level" in caplog_vllm.text def test_model_specification( @@ -383,7 +385,7 @@ def test_compilation_mode_string_values(parser): assert args.compilation_config == {"mode": 0} args = parser.parse_args(["-O3"]) - assert args.compilation_config == {"mode": 3} + assert args.optimization_level == 3 args = parser.parse_args(["-O.mode=NONE"]) assert args.compilation_config == {"mode": "NONE"} diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py index 7f9c2a0571c3c..12621d493e549 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -117,9 +117,9 @@ else: combo_cases_2 = [ ("FA2", "FULL", CompilationMode.NONE, True), ("FA2", "FULL", CompilationMode.VLLM_COMPILE, True), - ("FA2", "PIECEWISE", CompilationMode.NONE, False), + ("FA2", "PIECEWISE", CompilationMode.NONE, True), ("FA2", "PIECEWISE", 
CompilationMode.VLLM_COMPILE, True), - ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False), + ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, True), ("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True), ("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True), ("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True), diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 865d045676d14..da2c100dae3dc 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -8,7 +8,7 @@ from dataclasses import asdict, field from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Literal -from pydantic import TypeAdapter, field_validator +from pydantic import Field, TypeAdapter, field_validator from pydantic.dataclasses import dataclass import vllm.envs as envs @@ -97,19 +97,25 @@ class PassConfig: This is separate from general `CompilationConfig` so that inductor passes don't all have access to full configuration - that would create a cycle as - the `PassManager` is set as a property of config.""" + the `PassManager` is set as a property of config. - enable_fusion: bool = False + You must pass PassConfig to VLLMConfig constructor via the CompilationConfig + constructor. VLLMConfig's post_init does further initialization. + If used outside of the VLLMConfig, some fields may be left in an + improper state. 
+ """ + + enable_fusion: bool = Field(default=None) """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass.""" - enable_attn_fusion: bool = False + enable_attn_fusion: bool = Field(default=None) """Whether to enable the custom attention+quant fusion pass.""" - enable_noop: bool = False + enable_noop: bool = Field(default=None) """Whether to enable the custom no-op elimination pass.""" - enable_sequence_parallelism: bool = False + enable_sequence_parallelism: bool = Field(default=None) """Whether to enable sequence parallelism.""" - enable_async_tp: bool = False + enable_async_tp: bool = Field(default=None) """Whether to enable async TP.""" - enable_fi_allreduce_fusion: bool = False + enable_fi_allreduce_fusion: bool = Field(default=None) """Whether to enable flashinfer allreduce fusion.""" fi_allreduce_fusion_max_size_mb: float | None = None """The threshold of the communicated tensor sizes under which @@ -167,6 +173,22 @@ class PassConfig: """ return InductorPass.hash_dict(asdict(self)) + @field_validator( + "enable_fusion", + "enable_attn_fusion", + "enable_noop", + "enable_sequence_parallelism", + "enable_async_tp", + "enable_fi_allreduce_fusion", + mode="wrap", + ) + @classmethod + def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: + """Skip validation if the value is `None` when initialisation is delayed.""" + if value is None: + return value + return handler(value) + def __post_init__(self) -> None: if not self.enable_noop: if self.enable_fusion: @@ -243,7 +265,13 @@ class DynamicShapesConfig: @config @dataclass class CompilationConfig: - """Configuration for compilation. It has three parts: + """Configuration for compilation. + + You must pass CompilationConfig to VLLMConfig constructor. + VLLMConfig's post_init does further initialization. If used outside of the + VLLMConfig, some fields will be left in an improper state. 
+ + It has three parts: - Top-level Compilation control: - [`mode`][vllm.config.CompilationConfig.mode] @@ -282,14 +310,14 @@ class CompilationConfig: """ # Top-level Compilation control - level: int | None = None + level: int = Field(default=None) """ Level is deprecated and will be removed in the next release, either 0.12.0 or 0.11.2 whichever is soonest. Please use mode. Currently all levels are mapped to mode. """ # Top-level Compilation control - mode: CompilationMode | None = None + mode: CompilationMode = Field(default=None) """The compilation approach used for torch.compile-based compilation of the model. @@ -390,7 +418,7 @@ class CompilationConfig: constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`.""" # CudaGraph compilation - cudagraph_mode: CUDAGraphMode | None = None + cudagraph_mode: CUDAGraphMode = Field(default=None) """ The mode of the cudagraph: @@ -452,7 +480,7 @@ class CompilationConfig: When `enable_lora` is False, this option has no effect. """ - use_inductor_graph_partition: bool = False + use_inductor_graph_partition: bool = Field(default=None) """Use inductor graph partition to split the graph at cudagraph_unsafe ops. This partition happens at inductor codegen time after all passes and fusions are finished. 
It generates a single `call` function which wraps @@ -648,6 +676,20 @@ class CompilationConfig: ) return value + @field_validator( + "level", + "mode", + "cudagraph_mode", + "use_inductor_graph_partition", + mode="wrap", + ) + @classmethod + def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: + """Skip validation if the value is `None` when initialisation is delayed.""" + if value is None: + return value + return handler(value) + def __post_init__(self) -> None: if self.level is not None: logger.warning( @@ -948,6 +990,13 @@ class CompilationConfig: op, ) + def is_custom_op_enabled(self, op: str) -> bool: + if "all" in self.custom_ops: + return f"-{op}" not in self.custom_ops + + assert "none" in self.custom_ops + return f"+{op}" in self.custom_ops + def adjust_cudagraph_sizes_for_spec_decode( self, uniform_decode_query_len: int, tensor_parallel_size: int ): diff --git a/vllm/config/model.py b/vllm/config/model.py index 25972f097f53d..84311596b660c 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1752,6 +1752,14 @@ class ModelConfig: logger.info("Using max model len %s", max_model_len) return max_model_len + def is_model_moe( + self, + ) -> bool: + return self.get_num_experts() > 1 + + def is_quantized(self) -> bool: + return getattr(self.hf_config, "quantization_config", None) is not None + def get_served_model_name(model: str, served_model_name: str | list[str] | None): """ diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 9342564aa3d3f..c576275e80fe3 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -9,8 +9,9 @@ import tempfile import threading import time from contextlib import contextmanager -from dataclasses import replace +from dataclasses import is_dataclass, replace from datetime import datetime +from enum import IntEnum from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING, Any, TypeVar, get_args @@ -57,6 +58,103 @@ else: logger = init_logger(__name__) +class 
OptimizationLevel(IntEnum): + """Optimization level enum.""" + + O0 = 0 + """O0 : No optimization. no compilation, no cudagraphs, no other + optimization, just starting up immediately""" + O1 = 1 + """O1: Quick optimizations. Dynamo+Inductor compilation and Piecewise + cudagraphs""" + O2 = 2 + """O2: Full optimizations. -O1 as well as Full and Piecewise cudagraphs.""" + O3 = 3 + """O3: Currently the same as -O2.""" + + +IS_QUANTIZED = False +IS_DENSE = False +# The optimizations that depend on these properties currently set to False +# in all cases. +# if model_config is not None: +# IS_QUANTIZED = lambda c: c.model_config.is_quantized() +# IS_DENSE = lambda c: not c.model_config.is_model_moe() +# See https://github.com/vllm-project/vllm/issues/25689. + + +def enable_fusion(cfg: "VllmConfig") -> bool: + """Returns True if RMS norm or quant FP8 is enabled.""" + return cfg.compilation_config.is_custom_op_enabled( + "rms_norm" + ) or cfg.compilation_config.is_custom_op_enabled("quant_fp8") + + +OPTIMIZATION_LEVEL_00 = { + "compilation_config": { + "pass_config": { + "enable_noop": False, + "enable_fusion": False, + "enable_fi_allreduce_fusion": False, + "enable_attn_fusion": False, + "enable_sequence_parallelism": False, + "enable_async_tp": False, + }, + "cudagraph_mode": CUDAGraphMode.NONE, + "use_inductor_graph_partition": False, + }, +} +OPTIMIZATION_LEVEL_01 = { + "compilation_config": { + "pass_config": { + "enable_noop": True, + "enable_fusion": enable_fusion, + "enable_fi_allreduce_fusion": False, + "enable_attn_fusion": False, + "enable_sequence_parallelism": False, + "enable_async_tp": False, + }, + "cudagraph_mode": CUDAGraphMode.PIECEWISE, + "use_inductor_graph_partition": False, + }, +} +OPTIMIZATION_LEVEL_02 = { + "compilation_config": { + "pass_config": { + "enable_noop": True, + "enable_fusion": enable_fusion, + "enable_fi_allreduce_fusion": False, + "enable_attn_fusion": IS_QUANTIZED, + "enable_sequence_parallelism": IS_DENSE, + "enable_async_tp": 
IS_DENSE, + }, + "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE, + "use_inductor_graph_partition": False, + }, +} +OPTIMIZATION_LEVEL_03 = { + "compilation_config": { + "pass_config": { + "enable_noop": True, + "enable_fusion": enable_fusion, + "enable_fi_allreduce_fusion": False, + "enable_attn_fusion": IS_QUANTIZED, + "enable_sequence_parallelism": IS_DENSE, + "enable_async_tp": IS_DENSE, + }, + "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE, + "use_inductor_graph_partition": False, + }, +} + +OPTIMIZATION_LEVEL_TO_CONFIG = { + OptimizationLevel.O0: OPTIMIZATION_LEVEL_00, + OptimizationLevel.O1: OPTIMIZATION_LEVEL_01, + OptimizationLevel.O2: OPTIMIZATION_LEVEL_02, + OptimizationLevel.O3: OPTIMIZATION_LEVEL_03, +} + + @config @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class VllmConfig: @@ -116,6 +214,11 @@ class VllmConfig: you are using. Contents must be hashable.""" instance_id: str = "" """The ID of the vLLM instance.""" + optimization_level: OptimizationLevel = OptimizationLevel.O2 + """The optimization level. These levels trade startup time cost for + performance, with -O0 having the best startup time and -O3 having the best + performance. -O2 is used by default. See OptimizationLevel for full + description.""" def compute_hash(self) -> str: """ @@ -297,6 +400,50 @@ class VllmConfig: return replace(self, model_config=model_config) + def _set_config_default(self, config_obj: Any, key: str, value: Any) -> None: + """Set config attribute to default if not already set by user. + + Args: + config_obj: Configuration object to update. + key: Attribute name. + value: Default value (static or callable). + """ + if getattr(config_obj, key) is None: + # Some config values are known before initialization and are + # hard coded. + # Other values depend on the user given configuration, so they are + # implemented with lambda functions and decided at run time. 
+ setattr(config_obj, key, value(self) if callable(value) else value) + + def _apply_optimization_level_defaults(self, defaults: dict[str, Any]) -> None: + """Apply optimization level defaults using self as root. + + Recursively applies values from defaults into nested config objects. + Only fields present in defaults are overwritten. + + If the user configuration does not specify a value for a default field + and if the default field is still None after all user selections are + applied, then default values will be applied to the field. User specified + fields will not be overridden by the default. + + Args: + defaults: Dictionary of default values to apply. + """ + + def apply_recursive(config_obj: Any, config_defaults: dict[str, Any]) -> None: + """Recursively apply defaults to config_obj, using self as root.""" + for key, value in config_defaults.items(): + if not hasattr(config_obj, key): + continue + + current = getattr(config_obj, key) + if isinstance(value, dict) and is_dataclass(current): + apply_recursive(current, value) + else: + self._set_config_default(config_obj, key, value) + + apply_recursive(self, defaults) + def _post_init_kv_transfer_config(self) -> None: """Update KVTransferConfig based on top-level configs in VllmConfig. @@ -434,17 +581,47 @@ class VllmConfig: "precision for chunked prefill triton kernels." ) - # If the user does not explicitly set a compilation mode, then - # we use the default mode. The default mode depends on other - # settings (see the below code). 
+ if ( + self.optimization_level > OptimizationLevel.O0 + and self.model_config is not None + and self.model_config.enforce_eager + ): + logger.warning("Enforce eager set, overriding optimization level to -O0") + self.optimization_level = OptimizationLevel.O0 + + if self.compilation_config.backend == "eager" or ( + self.compilation_config.mode is not None + and self.compilation_config.mode != CompilationMode.VLLM_COMPILE + ): + logger.warning( + "Inductor compilation was disabled by user settings," + "Optimizations settings that are only active during" + "Inductor compilation will be ignored." + ) + + def has_blocked_weights(): + if self.quant_config is not None: + if hasattr(self.quant_config, "weight_block_size"): + return self.quant_config.weight_block_size is not None + elif hasattr(self.quant_config, "has_blocked_weights"): + return self.quant_config.has_blocked_weights() + return False + + # Enable quant_fp8 CUDA ops (TODO disable in follow up) + # On H100 the CUDA kernel is faster than + # native implementation + # https://github.com/vllm-project/vllm/issues/25094 + if has_blocked_weights(): + custom_ops = self.compilation_config.custom_ops + if "-quant_fp8" not in custom_ops: + custom_ops.append("+quant_fp8") + if self.compilation_config.mode is None: - if self.model_config is not None and not self.model_config.enforce_eager: + if self.optimization_level > OptimizationLevel.O0: self.compilation_config.mode = CompilationMode.VLLM_COMPILE else: self.compilation_config.mode = CompilationMode.NONE - # If user does not set custom ops via none or all set it here based on - # compilation mode and backend. 
if all(s not in self.compilation_config.custom_ops for s in ("all", "none")): if ( self.compilation_config.backend == "inductor" @@ -454,23 +631,33 @@ class VllmConfig: else: self.compilation_config.custom_ops.append("all") + default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level] + self._apply_optimization_level_defaults(default_config) + if ( + self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and self.compilation_config.mode != CompilationMode.VLLM_COMPILE + ): + logger.info( + "Cudagraph mode %s is not compatible with compilation mode %s." + "Overriding to NONE.", + self.compilation_config.cudagraph_mode, + self.compilation_config.mode, + ) + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE + # async tp is built on top of sequence parallelism # and requires it to be enabled. if self.compilation_config.pass_config.enable_async_tp: self.compilation_config.pass_config.enable_sequence_parallelism = True + if self.compilation_config.pass_config.enable_sequence_parallelism: + if "-rms_norm" in self.compilation_config.custom_ops: + logger.warning( + "RMS norm force disabled, sequence parallelism might break" + ) + else: + self.compilation_config.custom_ops.append("+rms_norm") if current_platform.support_static_graph_mode(): - # if cudagraph_mode is not explicitly set by users, set default - # value - if self.compilation_config.cudagraph_mode is None: - if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: - # default to full and piecewise for most models - self.compilation_config.cudagraph_mode = ( - CUDAGraphMode.FULL_AND_PIECEWISE - ) - else: - self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE - # if cudagraph_mode has full cudagraphs, we need to check support if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): # decode context parallel does not support full cudagraphs diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 696ff3a1f4024..e4c9a82d25223 100644 --- 
a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -77,6 +77,7 @@ from vllm.config.observability import DetailedTraceModules from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy from vllm.config.scheduler import SchedulerPolicy from vllm.config.utils import get_field +from vllm.config.vllm import OptimizationLevel from vllm.logger import init_logger, suppress_logging from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins @@ -560,6 +561,7 @@ class EngineArgs: stream_interval: int = SchedulerConfig.stream_interval kv_sharing_fast_prefill: bool = CacheConfig.kv_sharing_fast_prefill + optimization_level: OptimizationLevel = VllmConfig.optimization_level kv_offloading_size: float | None = CacheConfig.kv_offloading_size kv_offloading_backend: KVOffloadingBackend | None = ( @@ -1114,6 +1116,10 @@ class EngineArgs: "--structured-outputs-config", **vllm_kwargs["structured_outputs_config"] ) + vllm_group.add_argument( + "--optimization-level", **vllm_kwargs["optimization_level"] + ) + # Other arguments parser.add_argument( "--disable-log-stats", @@ -1733,7 +1739,6 @@ class EngineArgs: compilation_config.max_cudagraph_capture_size = ( self.max_cudagraph_capture_size ) - config = VllmConfig( model_config=model_config, cache_config=cache_config, @@ -1750,6 +1755,7 @@ class EngineArgs: kv_events_config=self.kv_events_config, ec_transfer_config=self.ec_transfer_config, additional_config=self.additional_config, + optimization_level=self.optimization_level, ) return config diff --git a/vllm/utils/argparse_utils.py b/vllm/utils/argparse_utils.py index 692e756d19634..b68157f02f6cc 100644 --- a/vllm/utils/argparse_utils.py +++ b/vllm/utils/argparse_utils.py @@ -247,16 +247,16 @@ class FlexibleArgumentParser(ArgumentParser): elif arg.startswith("-O") and arg != "-O" and arg[2] != ".": # allow -O flag to be used without space, e.g. 
-O3 or -Odecode # -O.<...> handled later - # also handle -O= here - mode = arg[3:] if arg[2] == "=" else arg[2:] - processed_args.append(f"-O.mode={mode}") + # also handle -O= here + optimization_level = arg[3:] if arg[2] == "=" else arg[2:] + processed_args += ["--optimization-level", optimization_level] elif ( arg == "-O" and i + 1 < len(args) and args[i + 1] in {"0", "1", "2", "3"} ): - # Convert -O to -O.mode - processed_args.append("-O.mode") + # Convert -O to --optimization-level + processed_args.append("--optimization-level") else: processed_args.append(arg) @@ -294,10 +294,24 @@ class FlexibleArgumentParser(ArgumentParser): delete = set[int]() dict_args = defaultdict[str, dict[str, Any]](dict) duplicates = set[str]() + # Track regular arguments (non-dict args) for duplicate detection + regular_args_seen = set[str]() for i, processed_arg in enumerate(processed_args): if i in delete: # skip if value from previous arg continue + if processed_arg.startswith("--") and "." not in processed_arg: + if "=" in processed_arg: + arg_name = processed_arg.split("=", 1)[0] + else: + arg_name = processed_arg + + if arg_name in regular_args_seen: + duplicates.add(arg_name) + else: + regular_args_seen.add(arg_name) + continue + if processed_arg.startswith("-") and "." 
in processed_arg: if "=" in processed_arg: processed_arg, value_str = processed_arg.split("=", 1) diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py index 4fd8eb50a4ea8..eb8e610ae4710 100644 --- a/vllm/v1/worker/gpu/cudagraph_utils.py +++ b/vllm/v1/worker/gpu/cudagraph_utils.py @@ -37,7 +37,7 @@ class CudaGraphManager: self.dp_size = vllm_config.parallel_config.data_parallel_size self.compilation_config = vllm_config.compilation_config assert self.compilation_config is not None - + self.cudagraph_mode: CUDAGraphMode if self.compilation_config.cudagraph_mode is None: self.cudagraph_mode = CUDAGraphMode.NONE else: From 51906c8c559f1d7c23efa667fcb3b7ed79f7fa25 Mon Sep 17 00:00:00 2001 From: maang-h <55082429+maang-h@users.noreply.github.com> Date: Thu, 27 Nov 2025 18:09:24 +0800 Subject: [PATCH 028/197] [Docs] Improve `priority` parameter documentation (#29572) Signed-off-by: maang Signed-off-by: maang-h <55082429+maang-h@users.noreply.github.com> Co-authored-by: Cyrus Leung --- vllm/entrypoints/llm.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 1860f383d45fb..f6ee746789981 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -405,6 +405,9 @@ class LLM: lora_request: LoRA request to use for generation, if any. priority: The priority of the requests, if any. Only applicable when priority scheduling policy is enabled. + If provided, must be a list of integers matching the length + of `prompts`, where each priority value corresponds to the prompt + at the same index. 
Returns: A list of `RequestOutput` objects containing the From e6d4f3c254a215e75b4d76d531176e242fe62a1f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 27 Nov 2025 18:23:06 +0800 Subject: [PATCH 029/197] [Bugfix] Fix pre-commit (#29601) Signed-off-by: DarkLight1337 --- .../ec_connector/integration/test_epd_correctness.py | 5 ++--- vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py | 11 +++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/v1/ec_connector/integration/test_epd_correctness.py b/tests/v1/ec_connector/integration/test_epd_correctness.py index 69c4c58e349b9..616d34441ab8e 100644 --- a/tests/v1/ec_connector/integration/test_epd_correctness.py +++ b/tests/v1/ec_connector/integration/test_epd_correctness.py @@ -237,9 +237,8 @@ def main(): for i, prompt_data in enumerate(test_prompts): print( - f"\nRunning prompt {i + 1}/{len(test_prompts)}: { - prompt_data['description'] - }" + f"\nRunning prompt {i + 1}/{len(test_prompts)}: " + f"{prompt_data['description']}" ) output_str = run_chat_completion( diff --git a/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py b/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py index a6f50d68cc684..dcdeedda60a77 100644 --- a/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py +++ b/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py @@ -35,13 +35,16 @@ class EagleCudaGraphManager: self.compilation_config = vllm_config.compilation_config assert self.compilation_config is not None + cudagraph_mode: CUDAGraphMode if self.compilation_config.cudagraph_mode is None: - self.cudagraph_mode = CUDAGraphMode.NONE + cudagraph_mode = CUDAGraphMode.NONE else: - self.cudagraph_mode = self.compilation_config.cudagraph_mode - if self.cudagraph_mode == CUDAGraphMode.FULL: + cudagraph_mode = self.compilation_config.cudagraph_mode + if cudagraph_mode == CUDAGraphMode.FULL: # NOTE(woosuk): For Eagle, we only use CUDA graphs for decode. 
- self.cudagraph_mode = CUDAGraphMode.FULL_DECODE_ONLY + cudagraph_mode = CUDAGraphMode.FULL_DECODE_ONLY + + self.cudagraph_mode = cudagraph_mode self.cudagraph_sizes = get_cudagraph_sizes( self.compilation_config.cudagraph_capture_sizes, From a5abd1d38439a026607d641c594ca98829ea5623 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 27 Nov 2025 19:33:19 +0800 Subject: [PATCH 030/197] [CI] Auto label CPU related issues (#29602) Signed-off-by: jiang1.li --- .github/workflows/issue_autolabel.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml index 7d565ef9f2e45..a8251ceed07f4 100644 --- a/.github/workflows/issue_autolabel.yml +++ b/.github/workflows/issue_autolabel.yml @@ -105,6 +105,31 @@ jobs: } ], }, + cpu: { + // Keyword search - matches whole words only (with word boundaries) + keywords: [ + { + term: "[CPU]", + searchIn: "title" + }, + { + term: "x86", + searchIn: "title" + }, + { + term: "ARM", + searchIn: "title" + }, + { + term: "Apple Silicon", + searchIn: "title" + }, + { + term: "IBM Z", + searchIn: "title" + }, + ], + }, // Add more label configurations here as needed // example: { // keywords: [...], From cf348c8d27c34247f5976a86ebe6f4a3b4f9e888 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 27 Nov 2025 04:36:24 -0800 Subject: [PATCH 031/197] [Bugfix] Fix HunyuanVL XD-RoPE (#29593) Signed-off-by: Roger Wang Co-authored by: grider-transwithai --- vllm/model_executor/models/hunyuan_vision.py | 2 +- vllm/transformers_utils/processors/hunyuan_vl_image.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index e83addd0c092f..2950db571e6ee 100644 --- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -847,7 +847,7 @@ class HunYuanVLForConditionalGeneration( .expand(-1, llm_grid_w + 
1) .reshape(-1) ) - h_index[pos : pos + token_num] = 0 + t_index[pos : pos + token_num] = image_index if xd_num == 4: llm_positions = torch.stack([p_index, w_index, h_index, t_index]) diff --git a/vllm/transformers_utils/processors/hunyuan_vl_image.py b/vllm/transformers_utils/processors/hunyuan_vl_image.py index 0a7e7865c783a..0b10ae249dbb6 100644 --- a/vllm/transformers_utils/processors/hunyuan_vl_image.py +++ b/vllm/transformers_utils/processors/hunyuan_vl_image.py @@ -195,9 +195,9 @@ class HunYuanVLImageProcessor(BaseImageProcessor): processed_images = [] for image in images: if do_resize: - resized_width, resized_height = smart_resize( - width, - height, + resized_height, resized_width = smart_resize( + height=height, + width=width, factor=patch_size * merge_size, min_pixels=self.min_pixels, max_pixels=self.max_pixels, From 2f5f9acd551cfb737997a1f7f86982ec74aabf79 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 27 Nov 2025 21:56:28 +0800 Subject: [PATCH 032/197] [LoRA] Continue optimizing MoE LoRA weight loading (#29322) Signed-off-by: Jee Jee Li --- tests/lora/test_lora_checkpoints.py | 15 +- tests/lora/test_lora_huggingface.py | 8 +- vllm/lora/layers/base.py | 2 +- vllm/lora/layers/column_parallel_linear.py | 16 +- vllm/lora/layers/fused_moe.py | 202 ++++++++++--------- vllm/lora/layers/logits_processor.py | 2 +- vllm/lora/layers/replicated_linear.py | 2 +- vllm/lora/layers/row_parallel_linear.py | 4 +- vllm/lora/layers/vocal_parallel_embedding.py | 2 +- vllm/lora/lora_weights.py | 53 +++++ vllm/lora/models.py | 50 ++--- vllm/lora/utils.py | 17 +- vllm/lora/worker_manager.py | 10 +- vllm/model_executor/models/interfaces.py | 1 + vllm/model_executor/models/qwen3_vl_moe.py | 1 + 15 files changed, 228 insertions(+), 157 deletions(-) diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 2219d470e91a1..b9b1bc59c6ed7 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -28,12 
+28,13 @@ def test_load_checkpoints( packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules - expected_lora_modules: list[str] = [] + expected_lora_lst: list[str] = [] for module in BAICHUAN_LORA_MODULES: if module in packed_modules_mapping: - expected_lora_modules.extend(packed_modules_mapping[module]) + expected_lora_lst.extend(packed_modules_mapping[module]) else: - expected_lora_modules.append(module) + expected_lora_lst.append(module) + expected_lora_modules = set(expected_lora_lst) if lora_name == "baichuan7B": peft_helper = PEFTHelper.from_local_dir( baichuan_lora_files, max_position_embeddings=4096 @@ -103,13 +104,13 @@ def test_lora_weights_mapping(baichuan_lora_files): packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules - expected_lora_modules: list[str] = [] + expected_lora_lst: list[str] = [] for module in BAICHUAN_LORA_MODULES: if module in packed_modules_mapping: - expected_lora_modules.extend(packed_modules_mapping[module]) + expected_lora_lst.extend(packed_modules_mapping[module]) else: - expected_lora_modules.append(module) - + expected_lora_lst.append(module) + expected_lora_modules = set(expected_lora_lst) hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "model.": "language_model.model.", diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 7d20faef541aa..6a787471c74fd 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -26,13 +26,13 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request): packed_modules_mapping = LlamaForCausalLM.packed_modules_mapping embedding_modules = LlamaForCausalLM.embedding_modules embed_padding_modules = 
LlamaForCausalLM.embedding_padding_modules - expected_lora_modules: list[str] = [] + expected_lora_lst: list[str] = [] for module in LLAMA_LORA_MODULES: if module in packed_modules_mapping: - expected_lora_modules.extend(packed_modules_mapping[module]) + expected_lora_lst.extend(packed_modules_mapping[module]) else: - expected_lora_modules.append(module) - + expected_lora_lst.append(module) + expected_lora_modules = set(expected_lora_lst) lora_path = get_adapter_absolute_path(lora_name) # lora loading should work for either absolute path and huggingface id. diff --git a/vllm/lora/layers/base.py b/vllm/lora/layers/base.py index 3bfb88c007622..a4b8fb4d2aec5 100644 --- a/vllm/lora/layers/base.py +++ b/vllm/lora/layers/base.py @@ -60,7 +60,7 @@ class BaseLayerWithLoRA(nn.Module): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" raise NotImplementedError diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py index 3e21d426c304a..904025901fba7 100644 --- a/vllm/lora/layers/column_parallel_linear.py +++ b/vllm/lora/layers/column_parallel_linear.py @@ -153,7 +153,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is ColumnParallelLinear or ( type(source_layer) is MergedColumnParallelLinear @@ -272,7 +272,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return ( type(source_layer) is MergedColumnParallelLinear @@ -338,7 
+338,7 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 1 @@ -396,7 +396,7 @@ class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 3 @@ -434,7 +434,7 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( @@ -480,7 +480,7 @@ class MergedColumnParallelLinearWithShardedLoRA(MergedColumnParallelLinearWithLo source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( @@ -516,7 +516,7 @@ class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( @@ -565,7 +565,7 @@ class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig 
| None, + model_config: PretrainedConfig | None = None, ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 1b925742c3002..3ad19370962ab 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -401,6 +401,61 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.w13_lora_b_stacked[1][lora_id][experts_id] ) + def _slice_w13_a(self, w13_lora_a: torch.Tensor) -> torch.Tensor: + """ + Applies to FusedMoEWithLoRA and FusedMoE3DWithLoRA + """ + if self.tp_size == 1 or not self.fully_sharded: + return w13_lora_a + + # w13_lora_a shape (num_experts,rank,input_size) + current_lora_rank = w13_lora_a.shape[1] + assert current_lora_rank % self.tp_size == 0 + # Based on S-LoRA, we slice W13/W1/W3 A along the rank dim. + sliced_rank = current_lora_rank // self.tp_size + start_idx = self.tp_rank * sliced_rank + end_idx = (self.tp_rank + 1) * sliced_rank + return w13_lora_a[:, start_idx:end_idx, :] + + def _slice_w13_b(self, w13_lora_b: torch.Tensor): + if self.tp_size == 1: + return w13_lora_b + + # w13_lora_b shape (num_experts,output_size,rank) + shard_size = self.base_layer.intermediate_size_per_partition + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + + return w13_lora_b[:, start_idx:end_idx, :] + + def _slice_w2_a(self, w2_lora_a: torch.Tensor) -> torch.Tensor: + """ + Applies to FusedMoEWithLoRA and FusedMoE3DWithLoRA + """ + if self.tp_size == 1: + return w2_lora_a + # w2_lora_a shape (num_experts,rank,input_size) + shard_size = self.base_layer.intermediate_size_per_partition + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + + return w2_lora_a[:, :, start_idx:end_idx] + + def _slice_w2_b(self, w2_lora_b: torch.Tensor) -> torch.Tensor: + """ + Applies to FusedMoEWithLoRA and FusedMoE3DWithLoRA + """ + if self.tp_size == 1 or not 
self.fully_sharded: + return w2_lora_b + # Based on S-LoRA, we slice W2 B along the hidden_size dim. + # w2_lora_b shape (num_experts,output_size,rank) + current_lora_size = w2_lora_b.shape[1] + + sliced_size = current_lora_size // self.tp_size + start_idx = self.tp_rank * sliced_size + end_idx = (self.tp_rank + 1) * sliced_size + return w2_lora_b[:, start_idx:end_idx, :] + def reset_lora(self, index: int): """Resets the lora weights at index back to 0.""" for pos in range(self._w13_slices): @@ -411,6 +466,8 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.w2_lora_b_stacked[0][index] = 0 self.adapter_enabled[index] = 0 + # + def set_lora( self, index: int, @@ -418,69 +475,55 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): lora_b: torch.Tensor | list[torch.Tensor], ): """Overwrites lora tensors at index.""" + # Make mypy happy assert isinstance(lora_a, list) assert isinstance(lora_b, list) + self.reset_lora(index) self.adapter_enabled[index] = 1 - for eid in range(len(lora_a) // 3): - w1_lora_a = lora_a[eid * 3] - w2_lora_a = lora_a[eid * 3 + 1] - w3_lora_a = lora_a[eid * 3 + 2] - w1_lora_b = lora_b[eid * 3] - w2_lora_b = lora_b[eid * 3 + 1] - w3_lora_b = lora_b[eid * 3 + 2] - # Handle the case of adding LoRA to only a subset of experts - if w1_lora_a is None or w2_lora_a is None or w3_lora_a is None: - continue + num_experts = self.w13_lora_a_stacked[0].shape[1] - if self.tp_size > 1: - shard_size = self.base_layer.intermediate_size_per_partition - start_idx = self.tp_rank * shard_size - end_idx = (self.tp_rank + 1) * shard_size + w1_lora_a, w2_lora_a, w3_lora_a = lora_a + w1_lora_b, w2_lora_b, w3_lora_b = lora_b + assert ( + num_experts + == w1_lora_a.shape[0] + == w2_lora_a.shape[0] + == w3_lora_a.shape[0] + ) - w1_lora_b = w1_lora_b[start_idx:end_idx, :] - w3_lora_b = w3_lora_b[start_idx:end_idx, :] - w2_lora_a = w2_lora_a[:, start_idx:end_idx] + slliced_w1_lora_a = self._slice_w13_a(w1_lora_a) + slliced_w1_lora_b = self._slice_w13_b(w1_lora_b) + 
slliced_w3_lora_a = self._slice_w13_a(w3_lora_a) + slliced_w3_lora_b = self._slice_w13_b(w3_lora_b) - if self.fully_sharded: - # Based on S-LoRA, we slice W1 and W3 A along the rank dim, - # and W2 B along the hidden_size dim. - w13_shard_size = self.w13_lora_a_stacked[0][index, eid].shape[0] - w13_start_idx = self.tp_rank * w13_shard_size - w13_end_idx = (self.tp_rank + 1) * w13_shard_size - w1_lora_a = w1_lora_a[w13_start_idx:w13_end_idx, :] - w3_lora_a = w3_lora_a[w13_start_idx:w13_end_idx, :] + sliced_w2_lora_a = self._slice_w2_a(w2_lora_a) + sliced_w2_lora_b = self._slice_w2_b(w2_lora_b) - w2_shard_size = self.w2_lora_b_stacked[0][index, eid].shape[0] - w2_start_idx = self.tp_rank * w2_shard_size - w2_end_idx = (self.tp_rank + 1) * w2_shard_size - w2_lora_b = w2_lora_b[w2_start_idx:w2_end_idx, :] - # w1 lora_a - self.w13_lora_a_stacked[0][ - index, eid, : w1_lora_a.shape[0], : w1_lora_a.shape[1] - ].copy_(w1_lora_a, non_blocking=True) - # w3 lora_a - self.w13_lora_a_stacked[1][ - index, eid, : w3_lora_a.shape[0], : w3_lora_a.shape[1] - ].copy_(w3_lora_a, non_blocking=True) + self.w13_lora_a_stacked[0][ + index, :, : slliced_w1_lora_a.shape[1], : slliced_w1_lora_a.shape[2] + ].copy_(slliced_w1_lora_a, non_blocking=True) - # w1 lora_b - self.w13_lora_b_stacked[0][ - index, eid, : w1_lora_b.shape[0], : w1_lora_b.shape[1] - ].copy_(w1_lora_b, non_blocking=True) - # w3 lora_b - self.w13_lora_b_stacked[1][ - index, eid, : w3_lora_b.shape[0], : w3_lora_b.shape[1] - ].copy_(w3_lora_b, non_blocking=True) + self.w13_lora_a_stacked[1][ + index, :, : slliced_w3_lora_a.shape[1], : slliced_w3_lora_a.shape[2] + ].copy_(slliced_w3_lora_a, non_blocking=True) - self.w2_lora_a_stacked[0][ - index, eid, : w2_lora_a.shape[0], : w2_lora_a.shape[1] - ].copy_(w2_lora_a, non_blocking=True) + self.w13_lora_b_stacked[0][ + index, :, : slliced_w1_lora_b.shape[1], : slliced_w1_lora_b.shape[2] + ].copy_(slliced_w1_lora_b, non_blocking=True) - self.w2_lora_b_stacked[0][ - index, eid, : 
w2_lora_b.shape[0], : w2_lora_b.shape[1] - ].copy_(w2_lora_b, non_blocking=True) + self.w13_lora_b_stacked[1][ + index, :, : slliced_w3_lora_b.shape[1], : slliced_w3_lora_b.shape[2] + ].copy_(slliced_w3_lora_b, non_blocking=True) + + self.w2_lora_a_stacked[0][ + index, :, : sliced_w2_lora_a.shape[1], : sliced_w2_lora_a.shape[2] + ].copy_(sliced_w2_lora_a, non_blocking=True) + + self.w2_lora_b_stacked[0][ + index, :, : sliced_w2_lora_b.shape[1], : sliced_w2_lora_b.shape[2] + ].copy_(sliced_w2_lora_b, non_blocking=True) def forward(self, *args, **kwargs): return self.base_layer.forward(*args, **kwargs) @@ -506,12 +549,12 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" - # return type(source_layer) is FusedMoE - return type(source_layer) is FusedMoE and len(packed_modules_list) == 2 + # source_layer is FusedMoE or SharedFusedMoE + return isinstance(source_layer, FusedMoE) and len(packed_modules_list) == 2 class FusedMoE3DWithLoRA(FusedMoEWithLoRA): @@ -555,6 +598,9 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): model_config: PretrainedConfig | None = None, ) -> None: """Initializes lora matrices.""" + + assert isinstance(model_config, PretrainedConfig) + self._base_model = model_config.architectures[0] self.max_loras = lora_config.max_loras self.fully_sharded = lora_config.fully_sharded_loras @@ -565,20 +611,7 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): self._create_lora_a_weights(max_loras, lora_config) self._create_lora_b_weights(max_loras, lora_config) - def _slice_w13_a(self, w13_lora_a: torch.Tensor) -> torch.Tensor: - if self.tp_size == 1 or not self.fully_sharded: - return w13_lora_a - - # w13_lora_a shape (num_experts,rank,input_size) - current_lora_rank = w13_lora_a.shape[1] - assert current_lora_rank % self.tp_size 
== 0 - - sliced_rank = current_lora_rank // self.tp_size - start_idx = self.tp_rank * sliced_rank - end_idx = (self.tp_rank + 1) * sliced_rank - return w13_lora_a[:, start_idx:end_idx, :] - - def _slice_w13_b(self, w13_lora_b: torch.Tensor, is_interleave: bool = True): + def _slice_w13_b(self, w13_lora_b: torch.Tensor): if self.tp_size == 1: return w13_lora_b @@ -586,7 +619,8 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): shard_size = self.base_layer.intermediate_size_per_partition start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size - if is_interleave: + # HACK: Currently, only GPT-OSS is in interleaved order + if self._base_model == "GptOssForCausalLM": # For models like GPT-OSS, the weights of w1 (gate_proj) and w3 (up_proj) # in the interleaved order, and corresponding LoRA need to be processed. w1_lora_b = w13_lora_b[:, ::2, :] @@ -606,28 +640,6 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): return torch.cat([sliced_w1_lora_b, sliced_w3_lora_b], dim=1) - def _slice_w2_a(self, w2_lora_a: torch.Tensor) -> torch.Tensor: - if self.tp_size == 1: - return w2_lora_a - # w2_lora_a shape (num_experts,rank,input_size) - shard_size = self.base_layer.intermediate_size_per_partition - start_idx = self.tp_rank * shard_size - end_idx = (self.tp_rank + 1) * shard_size - - return w2_lora_a[:, :, start_idx:end_idx] - - def _slice_w2_b(self, w2_lora_b: torch.Tensor) -> torch.Tensor: - if self.tp_size == 1 or not self.fully_sharded: - return w2_lora_b - # Based on S-LoRA, we slice W2 B along the hidden_size dim. 
- # w2_lora_b shape (num_experts,output_size,rank) - current_lora_size = w2_lora_b.shape[1] - - sliced_size = current_lora_size // self.tp_size - start_idx = self.tp_rank * sliced_size - end_idx = (self.tp_rank + 1) * sliced_size - return w2_lora_b[:, start_idx:end_idx, :] - def set_lora( self, index: int, @@ -658,7 +670,7 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): w2_lora_b = w2_lora_b.permute(1, 0, 2) sliced_w13_lora_a = self._slice_w13_a(w13_lora_a) - sliced_w13_lora_b = self._slice_w13_b(w13_lora_b, is_interleave=True) + sliced_w13_lora_b = self._slice_w13_b(w13_lora_b) sliced_w2_lora_a = self._slice_w2_a(w2_lora_a) sliced_w2_lora_b = self._slice_w2_b(w2_lora_b) @@ -711,8 +723,8 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" - - return type(source_layer) is FusedMoE and len(packed_modules_list) == 1 + # source_layer is FusedMoE or SharedFusedMoE + return isinstance(source_layer, FusedMoE) and len(packed_modules_list) == 1 diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py index c01984db4e64c..01515f6136371 100644 --- a/vllm/lora/layers/logits_processor.py +++ b/vllm/lora/layers/logits_processor.py @@ -197,7 +197,7 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # Special handling for the LogitsProcessor. 
return False diff --git a/vllm/lora/layers/replicated_linear.py b/vllm/lora/layers/replicated_linear.py index 243736c4ebc65..62bac546ccd1a 100644 --- a/vllm/lora/layers/replicated_linear.py +++ b/vllm/lora/layers/replicated_linear.py @@ -53,7 +53,7 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is ReplicatedLinear diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py index 95517b1aee263..958aa6af36746 100644 --- a/vllm/lora/layers/row_parallel_linear.py +++ b/vllm/lora/layers/row_parallel_linear.py @@ -87,7 +87,7 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is RowParallelLinear @@ -164,7 +164,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: # specifying kwargs so they can be easily accessed in decorator return super().can_replace_layer( diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py index c87ca9e24dece..4c1550d09e5e2 100644 --- a/vllm/lora/layers/vocal_parallel_embedding.py +++ b/vllm/lora/layers/vocal_parallel_embedding.py @@ -131,7 +131,7 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): source_layer: nn.Module, lora_config: LoRAConfig, packed_modules_list: list, - model_config: PretrainedConfig | None, + model_config: PretrainedConfig | None = None, ) -> bool: return type(source_layer) is VocabParallelEmbedding diff --git 
a/vllm/lora/lora_weights.py b/vllm/lora/lora_weights.py index f0d8e22194050..15c4a1be63eeb 100644 --- a/vllm/lora/lora_weights.py +++ b/vllm/lora/lora_weights.py @@ -152,6 +152,59 @@ class PackedLoRALayerWeights(LoRALayerWeights): ) return obj + @classmethod + def pack_moe( + cls, loras: GenericSequence[Optional["LoRALayerWeights"]], module_name: str + ) -> "PackedLoRALayerWeights": + """Pack a list of LoRAs into a single LoRA. + + If LoRA is None, it signifies that the submodule does not have a LoRA. + """ + + first_lora = next(lora for lora in loras if lora is not None) + assert first_lora is not None + rank = first_lora.rank + lora_alpha = first_lora.lora_alpha + assert len(loras) % 3 == 0 + w1_lora_a_lst = [] + w2_lora_a_lst = [] + w3_lora_a_lst = [] + w1_lora_b_lst = [] + w2_lora_b_lst = [] + w3_lora_b_lst = [] + # TODO: Consider the case where some experts don't have LoRA added. + for eid in range(len(loras) // 3): + w1_lora = loras[eid * 3] + w2_lora = loras[eid * 3 + 1] + w3_lora = loras[eid * 3 + 2] + assert w1_lora is not None + assert w2_lora is not None + assert w3_lora is not None + + w1_lora_a_lst.append(w1_lora.lora_a) + w2_lora_a_lst.append(w2_lora.lora_a) + w3_lora_a_lst.append(w3_lora.lora_a) + + w1_lora_b_lst.append(w1_lora.lora_b) + w2_lora_b_lst.append(w2_lora.lora_b) + w3_lora_b_lst.append(w3_lora.lora_b) + + w1_lora_a = torch.stack(w1_lora_a_lst, dim=0) # (num_experts,rank,input_size) + w2_lora_a = torch.stack(w2_lora_a_lst, dim=0) + w3_lora_a = torch.stack(w3_lora_a_lst, dim=0) + w1_lora_b = torch.stack(w1_lora_b_lst, dim=0) # (num_experts,output_size,rank) + w2_lora_b = torch.stack(w2_lora_b_lst, dim=0) + w3_lora_b = torch.stack(w3_lora_b_lst, dim=0) + + obj = cls( + module_name, + rank, + [lora_alpha, lora_alpha, lora_alpha], + [w1_lora_a, w2_lora_a, w3_lora_a], + [w1_lora_b, w2_lora_b, w3_lora_b], + ) + return obj + def optimize(self) -> "PackedLoRALayerWeights": """Optimize the LoRA by merging the scaling into lora_b.""" for i in 
range(len(self.lora_b)): diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 636f062feb7b0..4caaf0e117cc4 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -13,7 +13,7 @@ from torch import nn from vllm.config.lora import LoRAConfig from vllm.logger import init_logger -from vllm.lora.layers import BaseLayerWithLoRA, FusedMoEWithLoRA, LoRAMapping +from vllm.lora.layers import BaseLayerWithLoRA, FusedMoE3DWithLoRA, LoRAMapping from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.peft_helper import PEFTHelper from vllm.lora.punica_wrapper import get_punica_wrapper @@ -151,16 +151,13 @@ class LoRAModel: if pin_memory: loras[module_name].lora_b = loras[module_name].lora_b.pin_memory() - for lora in loras.values(): - lora.optimize() - return cls(lora_model_id, peft_helper.r, loras) @classmethod def from_local_checkpoint( cls, lora_dir: str, - expected_lora_modules: list[str], + expected_lora_modules: set[str], peft_helper: PEFTHelper, *, lora_model_id: int | None = None, @@ -190,10 +187,7 @@ class LoRAModel: lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt") - # new_embeddings_tensor_path = os.path.join( - # lora_dir, "new_embeddings.safetensors" - # ) - # new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") + tensors: dict[str, torch.Tensor] = {} unexpected_modules: list[list[str] | str] = [] @@ -201,18 +195,19 @@ class LoRAModel: for lora_module in modules.keys(): # noqa if is_base_embeddding_weights(lora_module): continue - module_name, _ = parse_fine_tuned_lora_name(lora_module, weights_mapper) - # Handle FSDP file format where experts.base_layer is the + # Handle PEFT file format where experts.base_layer is the # gate_up_proj and experts is the down_proj if "base_layer" in lora_module: continue + module_name, _ = 
parse_fine_tuned_lora_name(lora_module, weights_mapper) # Case for expert lora weights if ".experts" in module_name: - if not any( - module_name.endswith(ele) for ele in expected_lora_modules - ): + expert_idx = module_name.find(".experts") + expert_suffix = module_name[expert_idx + 1 :] + if expert_suffix not in expected_lora_modules: unexpected_modules.append(module_name) - elif module_name.split(".")[-1] not in expected_lora_modules: + + elif module_name.rsplit(".", 1)[-1] not in expected_lora_modules: unexpected_modules.append(module_name) if unexpected_modules: @@ -358,9 +353,7 @@ class LoRAModelManager: self.modules: dict[str, BaseLayerWithLoRA] = {} # Dict instead of a set for compatibility with LRUCache. self._last_mapping: LoRAMapping | None = None - self._is_3d_moe_model = is_moe_model(self.model) and hasattr( - self.model, "is_3d_moe_weight" - ) + self._is_3d_moe_model = is_moe_model(self.model) and self.model.is_3d_moe_weight self._create_lora_modules() self.model.lora_manager = self @@ -411,7 +404,7 @@ class LoRAModelManager: continue # Note (gnovack) - If MOE lora weights are not split into # num_experts chunks, we split them here - if isinstance(module, FusedMoEWithLoRA) and torch.is_tensor( + if isinstance(module, FusedMoE3DWithLoRA) and torch.is_tensor( module_lora.lora_a ): # Handle PEFT file format where experts.base_layer is the @@ -679,7 +672,10 @@ class LoRAModelManager: "cpu", ) subloras.append(lora) - lora = PackedLoRALayerWeights.pack(subloras) + if module.__class__.__name__ == "FusedMoEWithLoRA": + lora = PackedLoRALayerWeights.pack_moe(subloras, module_name) + else: + lora = PackedLoRALayerWeights.pack(subloras) model.loras[module_name] = lora return model @@ -739,13 +735,21 @@ class LoRAModelManager: replaced_module_name = module_name.replace("model.", "") if lora_model.check_lora_name(module_name): module_name = replaced_module_name - lora_model.loras[module_name] = PackedLoRALayerWeights.pack( - replacement_loras - ) + if 
module_name.endswith(".experts"): + lora_model.loras[module_name] = PackedLoRALayerWeights.pack_moe( + replacement_loras, module_name + ) + else: + lora_model.loras[module_name] = PackedLoRALayerWeights.pack( + replacement_loras + ) # Remove the modules that have been replaced. for module in replaced_module: lora_model.loras.pop(module, None) + for lora in lora_model.loras.values(): + lora.optimize() + def _get_lora_layer_weights( self, lora_model: LoRAModel, module_name: str ) -> LoRALayerWeights | None: diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 12524994d4968..47484b2b984df 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -170,16 +170,15 @@ def parse_fine_tuned_lora_name( def is_base_embeddding_weights(name: str) -> bool: # hardcoded subfixes for input & output embedding weights - input_embedding_subfix = ".embed_tokens.base_layer.weight" - output_embedding_subfix = ".lm_head.base_layer.weight" - - return name.endswith(input_embedding_subfix) or name.endswith( - output_embedding_subfix + embedding_suffixes = ( + ".embed_tokens.base_layer.weight", + ".lm_head.base_layer.weight", ) + return name.endswith(embedding_suffixes) def is_regex_target_modules( - load_modules: str | list[str], expected_lora_modules: list[str] + load_modules: str | list[str], expected_lora_modules: set[str] ) -> bool: """ PEFT supports passing `target_modules` in the form of regular expressions, @@ -195,8 +194,8 @@ def is_regex_target_modules( except re.error: return False - def is_subset(sub_list, full_list): - return set(sub_list).issubset(set(full_list)) + def is_subset(sub_list, full_set): + return set(sub_list).issubset(full_set) # Similar to PEFT's processing logic, regex-related operations are only # executed when the load_modules is a `str`. @@ -290,7 +289,7 @@ def process_packed_modules_mapping(model: nn.Module) -> dict[str, list[str]]: # the expert indices are expanded based on the configured number # of routed experts. 
packed_modules_mapping = get_packed_modules_mapping(model) - if not hasattr(model, "is_3d_moe_weight"): + if not model.is_3d_moe_weight: # 3D MoE LoRA does not need `packed_modules_mapping` packed_modules_mapping["experts"] = [ weight_name.rstrip(".") diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 4cc201a6414f1..d9a03f0500497 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -88,15 +88,15 @@ class WorkerLoRAManager: try: supported_lora_modules = self._adapter_manager.supported_lora_modules packed_modules_mapping = self._adapter_manager.packed_modules_mapping - expected_lora_modules: list[str] = [] + expected_lora_lst: list[str] = [] for module in supported_lora_modules: if module in packed_modules_mapping: - expected_lora_modules.extend(packed_modules_mapping[module]) + expected_lora_lst.extend(packed_modules_mapping[module]) else: - expected_lora_modules.append(module) + expected_lora_lst.append(module) if module == "experts": - expected_lora_modules.append(module) - expected_lora_modules = list(set(expected_lora_modules)) + expected_lora_lst.append(module) + expected_lora_modules = set(expected_lora_lst) lora_path = get_adapter_absolute_path(lora_request.lora_path) peft_helper = PEFTHelper.from_local_dir( diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 6f6ce32538b71..cee0b79e5e5ac 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -336,6 +336,7 @@ class SupportsLoRA(Protocol): There is no need to redefine this flag if this class is in the MRO of your model class. """ + is_3d_moe_weight: ClassVar[bool] = False # The `embedding_module` and `embedding_padding_modules` # are empty by default. 
embedding_modules: ClassVar[dict[str, str]] = {} diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index e2c129120b1a5..a054bd5b3831e 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -401,6 +401,7 @@ class Qwen3VLMoeMixtureOfExperts(MixtureOfExperts): class Qwen3VLMoeForConditionalGeneration( Qwen3VLForConditionalGeneration, Qwen3VLMoeMixtureOfExperts ): + is_3d_moe_weight: bool = True packed_modules_mapping = { "qkv_proj": [ "q_proj", From 882851dc817061de52c949ac27b11442e5529caa Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 27 Nov 2025 22:51:26 +0800 Subject: [PATCH 033/197] [CI/Build][Bugfix] Fix auto label issues for CPU (#29610) Signed-off-by: jiang1.li --- .github/workflows/issue_autolabel.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml index a8251ceed07f4..629966b959330 100644 --- a/.github/workflows/issue_autolabel.yml +++ b/.github/workflows/issue_autolabel.yml @@ -109,7 +109,7 @@ jobs: // Keyword search - matches whole words only (with word boundaries) keywords: [ { - term: "[CPU]", + term: "CPU Backend", searchIn: "title" }, { From bab438ff3e7bd93f861e66a60c6cbefe42af0d1a Mon Sep 17 00:00:00 2001 From: Ryan Rock Date: Thu, 27 Nov 2025 09:01:37 -0600 Subject: [PATCH 034/197] [CI/Build] Skip ray tests on ROCm (#29556) Signed-off-by: Ryan Rock --- tests/v1/distributed/test_async_llm_dp.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/v1/distributed/test_async_llm_dp.py b/tests/v1/distributed/test_async_llm_dp.py index 60f9017184ea0..3b5f2e5e8d72f 100644 --- a/tests/v1/distributed/test_async_llm_dp.py +++ b/tests/v1/distributed/test_async_llm_dp.py @@ -12,6 +12,7 @@ from vllm import SamplingParams from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs import PromptType +from 
vllm.platforms import current_platform from vllm.sampling_params import RequestOutputKind from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core_client import DPAsyncMPClient @@ -84,6 +85,10 @@ async def test_load( if async_scheduling and data_parallel_backend == "ray": # TODO(NickLucche) Re-enable when async scheduling is supported pytest.skip("Async scheduling is not supported with ray") + elif data_parallel_backend == "ray" and current_platform.is_rocm(): + pytest.skip( + "Ray as the distributed executor backend is not supported with ROCm." + ) stats_loggers = {} @dataclass From 66d3d5422c9b90f1ee9593e1793e86f14e4eb3f4 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Thu, 27 Nov 2025 16:15:50 +0100 Subject: [PATCH 035/197] [Doc]: fixing typos in diverse files (#29492) Signed-off-by: Didier Durand --- vllm/benchmarks/serve.py | 4 ++-- vllm/config/parallel.py | 4 ++-- vllm/lora/punica_wrapper/punica_base.py | 2 +- vllm/model_executor/models/adapters.py | 4 ++-- vllm/v1/sample/tpu/sampler.py | 2 +- vllm/v1/worker/dp_utils.py | 6 ++++-- 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index dddb050ec180e..519303c0bfa0a 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1005,7 +1005,7 @@ def add_cli_args(parser: argparse.ArgumentParser): help="Key-value pairs (e.g, --header x-additional-info=0.3.3) " "for headers to be passed with each request. These headers override " "per backend constants and values set via environment variable, and " - "will be overriden by other arguments (such as request ids).", + "will be overridden by other arguments (such as request ids).", ) parser.add_argument( "--max-concurrency", @@ -1138,7 +1138,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "--percentile-metrics", type=str, default=None, - help="Comma-separated list of selected metrics to report percentils. 
" + help="Comma-separated list of selected metrics to report percentiles. " "This argument specifies the metrics to report percentiles. " 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' 'If not specified, defaults to "ttft,tpot,itl" for generative models ' diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 7ba1da5db3849..4a8c8bc17cfc3 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -238,9 +238,9 @@ class ParallelConfig: cp_kv_cache_interleave_size: int = 1 """Interleave size of kv_cache storage while using DCP or PCP. For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`, - and `total_cp_world_size = pcp_world_size * dcp_world_szie`. + and `total_cp_world_size = pcp_world_size * dcp_world_size`. store interleave_size tokens on total_cp_rank i, - then store next interleave_size tokens on taotal_cp_rank i+1. + then store next interleave_size tokens on total_cp_rank i+1. Interleave_size=1: token-level alignment, where token `i` is stored on total_cp_rank `i % total_cp_world_size`. Interleave_size=block_size: block-level alignment, where tokens are diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index ce38751e4b6a7..47c42b095534a 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -173,7 +173,7 @@ class PunicaWrapperBase(PunicaWrapperABC): vocab_size: int, ): # NOTE We have remove lora extra vocab support for now. So we set - # extra_vocab_size alwayzs to 0, and extra_vocab_size will be removed. + # extra_vocab_size always to 0, and extra_vocab_size will be removed. 
extra_vocab_size = 0 ( diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index a9cc49451a1d3..5aba46f8614be 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -428,7 +428,7 @@ def load_weights_using_from_2_way_softmax( ) if text_config.tie_word_embeddings: # embed_tokens is the assumed name for input embeddings. If the model does not - # have this attribute, we fallback to get_input_embeddings(), which is used by + # have this attribute, we fall back to get_input_embeddings(), which is used by # the Transformers modeling backend. embed_tokens = ( model.model.embed_tokens @@ -486,7 +486,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te ) if text_config.tie_word_embeddings: # embed_tokens is the assumed name for input embeddings. If the model does not - # have this attribute, we fallback to get_input_embeddings(), which is used by + # have this attribute, we fall back to get_input_embeddings(), which is used by # the Transformers modeling backend. embed_tokens = ( model.model.embed_tokens diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py index 8f0463c76ce15..6d992bb37a59d 100644 --- a/vllm/v1/sample/tpu/sampler.py +++ b/vllm/v1/sample/tpu/sampler.py @@ -181,7 +181,7 @@ def apply_top_k_top_p( after thresholding the logit using this cut-off, the remaining elements shall constitute the top-p set. - Note: in the case of tie (i.e. multipple cut-off elements present in the + Note: in the case of tie (i.e. multiple cut-off elements present in the logit), all tie elements are included in the top-p set. In other words, this function does not break ties. 
Instead, these tie tokens have equal chance of being chosen during final sampling, so we can consider the tie diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py index 064f2f0360cbf..c1509de821b05 100644 --- a/vllm/v1/worker/dp_utils.py +++ b/vllm/v1/worker/dp_utils.py @@ -24,12 +24,14 @@ def _get_device_and_group(parallel_config: ParallelConfig): device = get_dp_group().device group = get_dp_group().device_group - # Transfering this tensor from GPU to CPU will introduce a GPU sync + # Transferring this tensor from GPU to CPU will introduce a GPU sync # point that could adversely affect performance of vllm with asynch # scheduling. This environment variable exists to quickly disable # this optimization if we run into this case. if parallel_config.disable_nccl_for_dp_synchronization: - logger.info_once("Using CPU all reduce to syncronize DP padding between ranks.") + logger.info_once( + "Using CPU all reduce to synchronize DP padding between ranks." + ) device = "cpu" group = get_dp_group().cpu_group return device, group From cd007a53b4a2d7a83e35de559dc87da09302e956 Mon Sep 17 00:00:00 2001 From: Mathis Felardos Date: Thu, 27 Nov 2025 16:32:38 +0100 Subject: [PATCH 036/197] [bugfix] avoid NIXL_ERR_REMOTE_DISCONNECT in nixl_connector when Prefill dies (#28120) Signed-off-by: Mathis Felardos --- .../kv_connector/v1/nixl_connector.py | 62 ++++++++++++------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index ff51840b84b14..d5edf84e8e7f1 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -1832,35 +1832,55 @@ class NixlConnectorWorker: done_req_ids: set[str] = set() for req_id, handles in list(transfers.items()): in_progress = False - for handle, _xfer_stime in handles: - xfer_state = 
self.nixl_wrapper.check_xfer_state(handle) - if xfer_state == "DONE": - # Get telemetry from NIXL - res = self.nixl_wrapper.get_xfer_telemetry(handle) - self.xfer_stats.record_transfer(res) - self.nixl_wrapper.release_xfer_handle(handle) - elif xfer_state == "PROC": - in_progress = True - continue - else: - # transfer failed - mark blocks as invalid - logger.error( - "NIXL transfer failed for request %s with state %s. " + for handle, xfer_start_time in handles: + try: + xfer_state = self.nixl_wrapper.check_xfer_state(handle) + if xfer_state == "DONE": + # Get telemetry from NIXL + res = self.nixl_wrapper.get_xfer_telemetry(handle) + self.xfer_stats.record_transfer(res) + self.nixl_wrapper.release_xfer_handle(handle) + elif xfer_state == "PROC": + in_progress = True + continue + else: + logger.error( + "NIXL transfer failed for request %s with state " + "%s. Marking blocks as invalid.", + req_id, + xfer_state, + ) + self._handle_failed_transfer(req_id, handle) + in_progress = False + except Exception: + logger.exception( + "NIXL transfer exception for request %s. " "Marking blocks as invalid.", req_id, - xfer_state, ) - # mark all (logical)blocks for this request as invalid - if meta := self._recving_metadata.pop(req_id, None): - self._invalid_block_ids.update(meta.local_block_ids) - self._recving_metadata.pop(req_id, None) - self.nixl_wrapper.release_xfer_handle(handle) - self.xfer_stats.record_failed_transfer() + self._handle_failed_transfer(req_id, handle) + in_progress = False + if not in_progress: done_req_ids.add(req_id) del transfers[req_id] return done_req_ids + def _handle_failed_transfer(self, req_id: str, handle: int): + """ + Handle a failed transfer by marking all (logical) blocks as invalid and + recording the failure. + + Args: + req_id: The request ID. + handle: The transfer handle. 
+ """ + if meta := self._recving_metadata.pop(req_id, None): + self._invalid_block_ids.update(meta.local_block_ids) + self._recving_metadata.pop(req_id, None) + self.nixl_wrapper.release_xfer_handle(handle) + self.xfer_stats.record_failed_transfer() + def start_load_kv(self, metadata: NixlConnectorMetadata): """ Start loading by triggering non-blocking nixl_xfer. From fc1d8be3dc97e33ade7fb578451006bb044a5e60 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 27 Nov 2025 11:19:09 -0500 Subject: [PATCH 037/197] [Attention] Update attention imports (#29540) Signed-off-by: Matthew Bonanni --- .../test_rocm_attention_backends_selection.py | 9 +++------ .../kv_connector/unit/test_backwards_compatibility.py | 6 +++--- vllm/attention/backends/abstract.py | 11 ++++------- vllm/attention/layers/chunked_local_attention.py | 3 +-- vllm/config/model.py | 3 +-- vllm/config/multimodal.py | 11 ++--------- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 4 ++-- .../kv_connector/v1/decode_bench_connector.py | 4 ++-- .../kv_transfer/kv_connector/v1/lmcache_connector.py | 4 ++-- .../v1/lmcache_integration/vllm_v1_adapter.py | 4 ++-- .../kv_connector/v1/lmcache_mp_connector.py | 4 ++-- .../kv_transfer/kv_connector/v1/multi_connector.py | 4 ++-- .../kv_transfer/kv_connector/v1/nixl_connector.py | 5 ++--- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 4 ++-- .../kv_connector/v1/shared_storage_connector.py | 4 ++-- vllm/forward_context.py | 8 +++----- vllm/model_executor/layers/attention_layer_base.py | 7 ++----- vllm/model_executor/layers/mamba/abstract.py | 7 ++----- .../compressed_tensors/compressed_tensors.py | 3 +-- vllm/model_executor/layers/quantization/fp8.py | 4 +--- vllm/model_executor/layers/quantization/modelopt.py | 3 +-- vllm/model_executor/layers/quantization/mxfp4.py | 3 +-- vllm/model_executor/layers/quantization/petit.py | 3 +-- vllm/model_executor/layers/quantization/ptpc_fp8.py | 3 +-- .../model_executor/layers/quantization/quark/quark.py | 3 +-- 
vllm/platforms/cpu.py | 5 +---- vllm/platforms/cuda.py | 10 ++-------- vllm/platforms/interface.py | 5 +---- vllm/platforms/rocm.py | 6 +----- vllm/platforms/tpu.py | 5 +---- vllm/platforms/xpu.py | 7 +------ vllm/v1/attention/backends/cpu_attn.py | 2 -- vllm/v1/attention/backends/flash_attn.py | 2 -- vllm/v1/attention/backends/flex_attention.py | 2 -- vllm/v1/attention/backends/utils.py | 7 +++++-- vllm/v1/kv_offload/spec.py | 4 ++-- vllm/v1/spec_decode/eagle.py | 3 +-- vllm/v1/worker/utils.py | 7 ++----- 38 files changed, 63 insertions(+), 126 deletions(-) diff --git a/tests/v1/attention/test_rocm_attention_backends_selection.py b/tests/v1/attention/test_rocm_attention_backends_selection.py index 80158d4b7278c..77790be6f892b 100644 --- a/tests/v1/attention/test_rocm_attention_backends_selection.py +++ b/tests/v1/attention/test_rocm_attention_backends_selection.py @@ -139,14 +139,13 @@ def test_standard_attention_backend_selection( import importlib import vllm.envs as envs - from vllm.attention.backends.registry import _Backend importlib.reload(envs) # Convert string backend to enum if provided backend_enum = None if selected_backend: - backend_enum = getattr(_Backend, selected_backend) + backend_enum = getattr(AttentionBackendEnum, selected_backend) # Get the backend class path from vllm.platforms.rocm import RocmPlatform @@ -253,7 +252,6 @@ def test_mla_backend_selection( import importlib import vllm.envs as envs - from vllm.attention.backends.registry import _Backend importlib.reload(envs) @@ -269,7 +267,7 @@ def test_mla_backend_selection( # Convert string backend to enum if provided backend_enum = None if selected_backend: - backend_enum = getattr(_Backend, selected_backend) + backend_enum = getattr(AttentionBackendEnum, selected_backend) from vllm.platforms.rocm import RocmPlatform @@ -301,7 +299,6 @@ def test_mla_backend_selection( def test_aiter_fa_requires_gfx9(mock_vllm_config): """Test that ROCM_AITER_FA requires gfx9 architecture.""" - from 
vllm.attention.backends.registry import _Backend from vllm.platforms.rocm import RocmPlatform # Mock on_gfx9 to return False @@ -313,7 +310,7 @@ def test_aiter_fa_requires_gfx9(mock_vllm_config): ), ): RocmPlatform.get_attn_backend_cls( - selected_backend=_Backend.ROCM_AITER_FA, + selected_backend=AttentionBackendEnum.ROCM_AITER_FA, head_size=128, dtype=torch.float16, kv_cache_dtype="auto", diff --git a/tests/v1/kv_connector/unit/test_backwards_compatibility.py b/tests/v1/kv_connector/unit/test_backwards_compatibility.py index f51001a6ec12a..7cd23805c599d 100644 --- a/tests/v1/kv_connector/unit/test_backwards_compatibility.py +++ b/tests/v1/kv_connector/unit/test_backwards_compatibility.py @@ -14,6 +14,7 @@ from unittest.mock import patch import pytest +from vllm.attention.backends.abstract import AttentionMetadata from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory from vllm.distributed.kv_transfer.kv_connector.v1 import ( KVConnectorBase_V1, @@ -24,7 +25,6 @@ from vllm.v1.core.sched.output import SchedulerOutput from .utils import create_scheduler, create_vllm_config if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks @@ -68,7 +68,7 @@ class OldStyleTestConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs, ) -> None: pass @@ -119,7 +119,7 @@ class NewStyleTestConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs, ) -> None: pass diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index a321167b8090f..c290670eeacb0 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -6,11 +6,10 @@ from typing import 
TYPE_CHECKING, ClassVar, Generic, Protocol, TypeVar, get_args import torch -from vllm.model_executor.layers.linear import ColumnParallelLinear -from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey - if TYPE_CHECKING: from vllm.config.cache import CacheDType + from vllm.model_executor.layers.linear import ColumnParallelLinear + from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey from vllm.platforms.interface import DeviceCapability from vllm.v1.attention.backends.utils import KVCacheLayoutType @@ -178,8 +177,6 @@ class AttentionBackend(ABC): By default, only supports decoder attention. Backends should override this to support other attention types. """ - from vllm.attention.backends.abstract import AttentionType - return attn_type == AttentionType.DECODER @classmethod @@ -360,7 +357,7 @@ class AttentionImpl(ABC, Generic[T]): ) -> torch.Tensor: raise NotImplementedError - def fused_output_quant_supported(self, quant_key: QuantKey): + def fused_output_quant_supported(self, quant_key: "QuantKey"): """ Does this attention implementation support fused output quantization. 
This is used by the AttnFusionPass to only fuse output quantization @@ -412,7 +409,7 @@ class MLAAttentionImpl(AttentionImpl[T], Generic[T]): qk_rope_head_dim: int, qk_head_dim: int, v_head_dim: int, - kv_b_proj: ColumnParallelLinear, + kv_b_proj: "ColumnParallelLinear", indexer: object | None = None, ) -> None: raise NotImplementedError diff --git a/vllm/attention/layers/chunked_local_attention.py b/vllm/attention/layers/chunked_local_attention.py index 48fcc6fa736bb..0ced0028ded9e 100644 --- a/vllm/attention/layers/chunked_local_attention.py +++ b/vllm/attention/layers/chunked_local_attention.py @@ -5,6 +5,7 @@ import functools import torch from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata +from vllm.attention.layer import Attention from vllm.attention.selector import get_attn_backend from vllm.config import CacheConfig from vllm.config.vllm import VllmConfig @@ -22,8 +23,6 @@ from vllm.v1.kv_cache_interface import ( KVCacheSpec, ) -from ..layer import Attention - @functools.lru_cache def create_chunked_local_attention_backend( diff --git a/vllm/config/model.py b/vllm/config/model.py index 84311596b660c..5dabd636c18c6 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -14,6 +14,7 @@ from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE from transformers.configuration_utils import ALLOWED_LAYER_TYPES import vllm.envs as envs +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig from vllm.config.pooler import PoolerConfig from vllm.config.scheduler import RunnerType @@ -53,7 +54,6 @@ if TYPE_CHECKING: import vllm.model_executor.layers.quantization as me_quant import vllm.model_executor.models as me_models - from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config.load import LoadConfig from vllm.config.parallel import ParallelConfig from vllm.model_executor.layers.quantization import 
QuantizationMethods @@ -61,7 +61,6 @@ if TYPE_CHECKING: else: PretrainedConfig = Any - AttentionBackendEnum = Any me_quant = LazyLoader( "model_executor", globals(), "vllm.model_executor.layers.quantization" ) diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 590bc4dcd0760..8a2936de96d6f 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -2,19 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping -from typing import TYPE_CHECKING, Any, Literal, TypeAlias +from typing import Any, Literal, TypeAlias from pydantic import ConfigDict, Field, field_validator, model_validator from pydantic.dataclasses import dataclass +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config.utils import config from vllm.utils.hashing import safe_hash -if TYPE_CHECKING: - from vllm.attention.backends.registry import AttentionBackendEnum -else: - AttentionBackendEnum = Any - @dataclass class BaseDummyOptions: @@ -170,9 +166,6 @@ class MultiModalConfig: def _validate_mm_encoder_attn_backend( cls, value: str | AttentionBackendEnum | None ) -> AttentionBackendEnum | None: - # We need to import the real type here (deferred to avoid circular import). 
- from vllm.attention.backends.registry import AttentionBackendEnum - if isinstance(value, str) and value.upper() == "XFORMERS": raise ValueError( "Attention backend 'XFORMERS' has been removed (See PR #29262 for " diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 74f09278b7bb1..cac45425bb7aa 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -42,12 +42,12 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional import torch +from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_events import KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( @@ -239,7 +239,7 @@ class KVConnectorBase_V1(ABC): return def register_cross_layers_kv_cache( - self, kv_cache: torch.Tensor, attn_backend: type["AttentionBackend"] + self, kv_cache: torch.Tensor, attn_backend: type[AttentionBackend] ): """ Initialize with a single KV cache tensor used by all layers. 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py index 9cd7d93c92fa3..e9b2bd392b0ef 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py @@ -36,6 +36,7 @@ from typing import TYPE_CHECKING, Any, Optional import torch +from vllm.attention.backends.abstract import AttentionMetadata from vllm.distributed.kv_transfer.kv_connector.v1 import ( KVConnectorBase_V1, KVConnectorRole, @@ -45,7 +46,6 @@ from vllm.logger import init_logger from vllm.utils.math_utils import cdiv if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks @@ -117,7 +117,7 @@ class DecodeBenchConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs: Any, ) -> None: # This connector doesn't save KV cache (benchmarking only) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index 0c24a53fb754b..30da424ddcca0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -7,6 +7,7 @@ from lmcache.integration.vllm.vllm_v1_adapter import ( LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl, ) +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, @@ -17,7 +18,6 @@ from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: - from vllm.attention.backends.abstract import 
AttentionMetadata from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.kv_cache_interface import KVCacheConfig @@ -91,7 +91,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1): self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs: Any, ) -> None: """ diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py index 94572b02fa872..15ac5b049fce9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py @@ -29,6 +29,7 @@ from lmcache.v1.lookup_client.lmcache_async_lookup_client import ( from lmcache.v1.offload_server.zmq_server import ZMQOffloadServer from lmcache.v1.plugin.plugin_launcher import PluginLauncher +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, @@ -50,7 +51,6 @@ from vllm.v1.core.sched.output import SchedulerOutput from vllm.version import __version__ as VLLM_VERSION if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.forward_context import ForwardContext from vllm.multimodal.inputs import PlaceholderRange from vllm.v1.core.kv_cache_manager import KVCacheManager @@ -915,7 +915,7 @@ class LMCacheConnectorV1Impl: self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs, ) -> None: """Start saving the a layer of KV cache from vLLM's paged buffer diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py index d1d3e475cc889..a4bddf5e03166 
100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py @@ -10,6 +10,7 @@ import zmq from lmcache.integration.vllm.utils import mla_enabled from lmcache.utils import init_logger as lmcache_init_logger +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, @@ -26,7 +27,6 @@ from vllm.v1.outputs import KVConnectorOutput from vllm.v1.utils import ConstantList if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_events import KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( @@ -490,7 +490,7 @@ class LMCacheMPConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs: Any, ) -> None: """ diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index c9d08e9b78ed0..f47e8ca7e6c50 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any import torch +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.config.kv_transfer import KVTransferConfig from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType @@ -27,7 +28,6 @@ from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.distributed.kv_events import KVCacheEvent from vllm.forward_context import ForwardContext from 
vllm.v1.core.kv_cache_manager import KVCacheBlocks @@ -216,7 +216,7 @@ class MultiConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs, ) -> None: for c in self._connectors: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index d5edf84e8e7f1..24c8d32dafedc 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -20,7 +20,7 @@ import torch import zmq from vllm import envs -from vllm.attention.backends.abstract import AttentionBackend +from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.selector import get_attn_backend from vllm.config import VllmConfig @@ -51,7 +51,6 @@ from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.worker.block_table import BlockTable if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request @@ -308,7 +307,7 @@ class NixlConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs, ) -> None: """NixlConnector does not save explicitly.""" diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index a124a0d519db8..8f3a62d7bcdb0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Optional import regex as 
re import torch +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, @@ -22,7 +23,6 @@ from vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.kv_cache_interface import KVCacheConfig @@ -243,7 +243,7 @@ class P2pNcclConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs: Any, ) -> None: """Start saving the KV cache of the layer from vLLM's paged buffer diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 4611b4d1ff7b8..ed641cfc43ddd 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Optional import safetensors import torch +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, @@ -19,7 +20,6 @@ from vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.kv_cache_interface import KVCacheConfig @@ -211,7 +211,7 @@ class SharedStorageConnector(KVConnectorBase_V1): self, layer_name: str, kv_layer: torch.Tensor, - 
attn_metadata: "AttentionMetadata", + attn_metadata: AttentionMetadata, **kwargs: Any, ) -> None: """Start saving the KV cache of the layer from vLLM's paged buffer diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 635419bc7cad4..173d366267e87 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -5,19 +5,17 @@ import time from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, NamedTuple +from typing import Any, NamedTuple import torch import vllm.envs as envs +from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig from vllm.logger import init_logger from vllm.v1.worker.dp_utils import coordinate_batch_across_dp from vllm.v1.worker.ubatch_utils import UBatchSlices -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata - logger = init_logger(__name__) track_batchsize: bool = envs.VLLM_LOG_BATCHSIZE_INTERVAL >= 0 @@ -195,7 +193,7 @@ class ForwardContext: for each microbatch. 
Set dynamically for each forward pass """ - attn_metadata: dict[str, "AttentionMetadata"] | list[dict[str, "AttentionMetadata"]] + attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]] # TODO: remove after making all virtual_engines share the same kv cache virtual_engine: int # set dynamically for each forward pass # set dynamically for each forward pass diff --git a/vllm/model_executor/layers/attention_layer_base.py b/vllm/model_executor/layers/attention_layer_base.py index ffbef470b1868..a60cf787135c0 100644 --- a/vllm/model_executor/layers/attention_layer_base.py +++ b/vllm/model_executor/layers/attention_layer_base.py @@ -3,14 +3,11 @@ """Base class for attention-like layers.""" from abc import ABC, abstractmethod -from typing import TYPE_CHECKING +from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig from vllm.v1.kv_cache_interface import KVCacheSpec -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - class AttentionLayerBase(ABC): """ @@ -22,7 +19,7 @@ class AttentionLayerBase(ABC): """ @abstractmethod - def get_attn_backend(self) -> type["AttentionBackend"]: + def get_attn_backend(self) -> type[AttentionBackend]: """Get the attention backend class for this layer.""" pass diff --git a/vllm/model_executor/layers/mamba/abstract.py b/vllm/model_executor/layers/mamba/abstract.py index aa919d6fdc35c..74f4383e9c238 100644 --- a/vllm/model_executor/layers/mamba/abstract.py +++ b/vllm/model_executor/layers/mamba/abstract.py @@ -2,18 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod from collections.abc import Iterable -from typing import TYPE_CHECKING import torch +from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.selector import get_mamba_attn_backend from vllm.config import VllmConfig from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from 
vllm.v1.kv_cache_interface import KVCacheSpec, MambaSpec -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - class MambaBase(AttentionLayerBase): """ @@ -66,6 +63,6 @@ class MambaBase(AttentionLayerBase): ), ) - def get_attn_backend(self) -> type["AttentionBackend"]: + def get_attn_backend(self) -> type[AttentionBackend]: """Get the attention backend class for this Mamba layer.""" return get_mamba_attn_backend(self.mamba_type) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 7f61746a4e45c..f9d8f5883680b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -18,6 +18,7 @@ from compressed_tensors.quantization import ( from compressed_tensors.transform import TransformConfig import vllm.envs as envs +from vllm.attention.layer import Attention from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( @@ -131,8 +132,6 @@ class CompressedTensorsConfig(QuantizationConfig): layer: torch.nn.Module, prefix: str, ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - if isinstance(layer, LinearBase): # collect schemes quant_scheme = self.get_scheme(layer=layer, layer_name=prefix) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e033032903e87..7dfc8a9c36c3e 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -14,6 +14,7 @@ import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm._aiter_ops import rocm_aiter_ops +from vllm.attention.layer import 
Attention from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.batch_invariant import ( @@ -277,7 +278,6 @@ class Fp8Config(QuantizationConfig): def get_xpu_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention from vllm.model_executor.layers.quantization.ipex_quant import ( XPUFp8LinearMethod, XPUFp8MoEMethod, @@ -307,8 +307,6 @@ class Fp8Config(QuantizationConfig): def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - if current_platform.is_xpu(): return self.get_xpu_quant_method(layer, prefix) if isinstance(layer, LinearBase): diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 2cf7089e0ff90..80f8e3a03e7cf 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -12,6 +12,7 @@ from torch.nn.parameter import Parameter import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant +from vllm.attention.layer import Attention from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, @@ -149,8 +150,6 @@ class ModelOptQuantConfigBase(QuantizationConfig): def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - # handle kv-cache first so we can focus only on weight quantization thereafter if isinstance(layer, Attention): return self.KVCacheMethodCls(self) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 
d975131f7cff7..bc241ac692e23 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -8,6 +8,7 @@ import torch from torch.nn.parameter import Parameter from vllm import envs +from vllm.attention.layer import Attention from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( @@ -184,8 +185,6 @@ class Mxfp4Config(QuantizationConfig): def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - if isinstance(layer, LinearBase): if self.ignored_layers and is_layer_skipped( prefix=prefix, diff --git a/vllm/model_executor/layers/quantization/petit.py b/vllm/model_executor/layers/quantization/petit.py index 402cebc38c215..5ccc73166361a 100644 --- a/vllm/model_executor/layers/quantization/petit.py +++ b/vllm/model_executor/layers/quantization/petit.py @@ -8,6 +8,7 @@ import regex as re import torch from torch.nn.parameter import Parameter +from vllm.attention.layer import Attention from vllm.logger import init_logger from vllm.model_executor.layers.linear import ( LinearBase, @@ -159,8 +160,6 @@ class PetitNvFp4Config(QuantizationConfig): def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - exclude = self.require_exclude_modules() if isinstance(layer, LinearBase): diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py index 26ba8e5b16bc0..ed8a2c7fa0841 100644 --- a/vllm/model_executor/layers/quantization/ptpc_fp8.py +++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py @@ -7,6 +7,7 @@ import torch from torch.nn.parameter import Parameter from vllm import _custom_ops as ops +from vllm.attention.layer import Attention from vllm.logger import 
init_logger from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod from vllm.model_executor.layers.quantization import QuantizationMethods @@ -65,8 +66,6 @@ class PTPCFp8Config(Fp8Config): def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - if isinstance(layer, LinearBase): if is_layer_skipped(prefix, self.ignored_layers): return UnquantizedLinearMethod() diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index f59e5e2a0af7a..3640e5c452786 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, Optional, cast import torch +from vllm.attention.layer import Attention from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( @@ -102,8 +103,6 @@ class QuarkConfig(QuantizationConfig): def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import Attention # Avoid circular import - # Check if the layer is skipped for quantization. 
exclude_layers = cast(list[str], self.quant_config.get("exclude")) if should_ignore_layer( diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index ed655912d3964..5f9561366e0d5 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -14,6 +14,7 @@ import regex as re import torch from vllm import envs +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger from .interface import CpuArchEnum, Platform, PlatformEnum @@ -21,10 +22,8 @@ from .interface import CpuArchEnum, Platform, PlatformEnum logger = init_logger(__name__) if TYPE_CHECKING: - from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import VllmConfig else: - AttentionBackendEnum = None VllmConfig = None @@ -135,8 +134,6 @@ class CpuPlatform(Platform): use_sparse: bool, attn_type: str | None = None, ) -> str: - from vllm.attention.backends.registry import AttentionBackendEnum - if selected_backend and selected_backend != AttentionBackendEnum.CPU_ATTN: logger.info("Cannot use %s backend on CPU.", selected_backend) if use_mla: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index e8e14387bb7f6..d5c3a177d9c2b 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -15,6 +15,8 @@ from typing_extensions import ParamSpec # import custom ops, trigger op registration import vllm._C # noqa import vllm.envs as envs +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger from vllm.utils.import_utils import import_pynvml from vllm.utils.torch_utils import cuda_device_count_stateless @@ -22,11 +24,9 @@ from vllm.utils.torch_utils import cuda_device_count_stateless from .interface import DeviceCapability, Platform, PlatformEnum if TYPE_CHECKING: - from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import VllmConfig from vllm.config.cache import CacheDType else: - 
AttentionBackendEnum = None VllmConfig = None CacheDType = None @@ -48,8 +48,6 @@ def _get_backend_priorities( device_capability: DeviceCapability, ) -> list[AttentionBackendEnum]: """Get backend priorities with lazy import to avoid circular dependency.""" - from vllm.attention.backends.registry import AttentionBackendEnum - if use_mla: if device_capability.major == 10: return [ @@ -265,8 +263,6 @@ class CudaPlatformBase(Platform): def get_vit_attn_backend( cls, head_size: int, dtype: torch.dtype ) -> "AttentionBackendEnum": - from vllm.attention.backends.registry import AttentionBackendEnum - # Try FlashAttention first try: backend_class = AttentionBackendEnum.FLASH_ATTN.get_class() @@ -335,8 +331,6 @@ class CudaPlatformBase(Platform): use_sparse: bool, attn_type: str | None = None, ) -> str: - from vllm.attention.backends.abstract import AttentionType - if attn_type is None: attn_type = AttentionType.DECODER diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 1e6b53021f888..27c6fac09f498 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -12,12 +12,12 @@ from typing import TYPE_CHECKING, Any, NamedTuple import numpy as np import torch +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger if TYPE_CHECKING: from torch.distributed import PrefixStore, ProcessGroup - from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import VllmConfig from vllm.config.cache import CacheDType from vllm.inputs import ProcessorInputs, PromptType @@ -226,9 +226,6 @@ class Platform: def get_vit_attn_backend( cls, head_size: int, dtype: torch.dtype ) -> "AttentionBackendEnum": - # Import AttentionBackendEnum here to avoid circular import. 
- from vllm.attention.backends.registry import AttentionBackendEnum - return AttentionBackendEnum.TORCH_SDPA @classmethod diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 0483f6c06ada8..ccf3446a3a6e5 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -8,16 +8,14 @@ from typing import TYPE_CHECKING import torch import vllm.envs as envs +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger from vllm.utils.torch_utils import cuda_device_count_stateless from .interface import DeviceCapability, Platform, PlatformEnum if TYPE_CHECKING: - from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import VllmConfig -else: - AttentionBackendEnum = None logger = init_logger(__name__) @@ -196,7 +194,6 @@ class RocmPlatform(Platform): from importlib.util import find_spec from vllm._aiter_ops import rocm_aiter_ops - from vllm.attention.backends.registry import AttentionBackendEnum if rocm_aiter_ops.is_mha_enabled(): # Note: AITER FA is only supported for Qwen-VL models. 
@@ -222,7 +219,6 @@ class RocmPlatform(Platform): attn_type: str | None = None, ) -> str: from vllm._aiter_ops import rocm_aiter_ops - from vllm.attention.backends.registry import AttentionBackendEnum if use_sparse: if kv_cache_dtype.startswith("fp8"): diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 04325a522f444..cbc0a996f3661 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, cast import torch from tpu_info import device +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.inputs import ProcessorInputs, PromptType from vllm.logger import init_logger @@ -15,7 +16,6 @@ from .interface import Platform, PlatformEnum if TYPE_CHECKING: from typing import TypeAlias - from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import VllmConfig from vllm.config.cache import BlockSize from vllm.pooling_params import PoolingParams @@ -26,7 +26,6 @@ else: BlockSize = None VllmConfig = None PoolingParams = None - AttentionBackendEnum = None ParamsType = None logger = init_logger(__name__) @@ -67,8 +66,6 @@ class TpuPlatform(Platform): use_sparse, attn_type: str | None = None, ) -> str: - from vllm.attention.backends.registry import AttentionBackendEnum - if use_sparse: raise NotImplementedError("Sparse Attention is not supported on TPU.") if selected_backend != AttentionBackendEnum.PALLAS: diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 18a3186b142f1..768714fb16726 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -8,16 +8,15 @@ from typing import TYPE_CHECKING import torch import vllm.envs as envs +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger from .interface import DeviceCapability, Platform, PlatformEnum if TYPE_CHECKING: - from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import VllmConfig else: VllmConfig = None - 
AttentionBackendEnum = None logger = init_logger(__name__) @@ -60,8 +59,6 @@ class XPUPlatform(Platform): "only NHD layout is supported by XPU attention kernels." ) - from vllm.attention.backends.registry import AttentionBackendEnum - if use_sparse: raise NotImplementedError("Sparse Attention is not supported on XPU.") if selected_backend == AttentionBackendEnum.TRITON_ATTN: @@ -116,8 +113,6 @@ class XPUPlatform(Platform): def get_vit_attn_backend( cls, head_size: int, dtype: torch.dtype ) -> "AttentionBackendEnum": - from vllm.attention.backends.registry import AttentionBackendEnum - return AttentionBackendEnum.FLASH_ATTN @classmethod diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index d0b1f8c1b8071..fed7dcdf293bd 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -51,8 +51,6 @@ class CPUAttentionBackend(AttentionBackend): @classmethod def supports_attn_type(cls, attn_type: str) -> bool: """CPU attention supports decoder and encoder-only attention.""" - from vllm.attention.backends.abstract import AttentionType - return attn_type in ( AttentionType.DECODER, AttentionType.ENCODER, diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index a1558073003fd..fb080b0b33bc0 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -84,8 +84,6 @@ class FlashAttentionBackend(AttentionBackend): @classmethod def supports_attn_type(cls, attn_type: str) -> bool: """FlashAttention supports all attention types.""" - from vllm.attention.backends.abstract import AttentionType - return attn_type in ( AttentionType.DECODER, AttentionType.ENCODER, diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index 3869f1f4164c9..8de0a0a11471f 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -87,8 
+87,6 @@ class FlexAttentionBackend(AttentionBackend): @classmethod def supports_attn_type(cls, attn_type: str) -> bool: """FlexAttention supports both decoder and encoder-only attention.""" - from vllm.attention.backends.abstract import AttentionType - return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY) @staticmethod diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index ea9dccc702a0a..6e0d84e4fb4ac 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -24,12 +24,15 @@ from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.utils.math_utils import cdiv if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionImpl from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.worker.gpu_input_batch import InputBatch import vllm.envs as envs -from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata +from vllm.attention.backends.abstract import ( + AttentionBackend, + AttentionImpl, + AttentionMetadata, +) from vllm.distributed.kv_transfer.kv_connector.utils import ( get_kv_connector_cache_layout, ) diff --git a/vllm/v1/kv_offload/spec.py b/vllm/v1/kv_offload/spec.py index 3afce55890752..2cdd5ba5ffe5c 100644 --- a/vllm/v1/kv_offload/spec.py +++ b/vllm/v1/kv_offload/spec.py @@ -6,12 +6,12 @@ from typing import TYPE_CHECKING import torch +from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager from vllm.v1.kv_offload.worker.worker import OffloadingHandler if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig logger = init_logger(__name__) @@ -51,7 +51,7 @@ class OffloadingSpec(ABC): def get_handlers( self, kv_caches: dict[str, torch.Tensor], - attn_backends: dict[str, type["AttentionBackend"]], + attn_backends: dict[str, 
type[AttentionBackend]], ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: """ Get offloading handlers along with their respective src and dst types. diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 7b9037c03d4f0..7600df48150ac 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -8,6 +8,7 @@ import numpy as np import torch import torch.nn as nn +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import ( CompilationMode, CUDAGraphMode, @@ -157,8 +158,6 @@ class EagleProposer: ) # Determine allowed attention backends once during initialization. - from vllm.attention.backends.registry import AttentionBackendEnum - self.allowed_attn_types: tuple | None = None if current_platform.is_rocm(): rocm_types = [TritonAttentionMetadata, FlashAttentionMetadata] diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 92e4ce3abdba3..bd88cb1b253f8 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -2,11 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import defaultdict from dataclasses import dataclass, field -from typing import TYPE_CHECKING import torch from vllm.attention.backends.abstract import AttentionBackend +from vllm.attention.layer import Attention from vllm.config import ModelConfig, SchedulerConfig, VllmConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index @@ -17,9 +17,6 @@ from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec -if TYPE_CHECKING: - from vllm.attention.layer import Attention - class MultiModalBudget: """Helper class to calculate budget information for multi-modal models.""" @@ -278,7 +275,7 @@ def 
add_kv_sharing_layers_to_kv_cache_groups( def bind_kv_cache( kv_caches: dict[str, torch.Tensor], - forward_context: dict[str, "Attention"], + forward_context: dict[str, Attention], runner_kv_caches: list[torch.Tensor], num_attn_module: int = 1, ) -> None: From e1f262337bcf774032019b5b717a6297a860f190 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 27 Nov 2025 16:42:14 +0000 Subject: [PATCH 038/197] Update Transformers pin in CI to 4.57.3 (#29418) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements/nightly_torch_test.txt | 2 +- requirements/test.in | 2 +- requirements/test.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index d9c5d89c1d52f..53b012372be8e 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -29,7 +29,7 @@ opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test mteb>=1.38.11, <2 # required for mteb test -transformers==4.57.1 +transformers==4.57.3 tokenizers==0.22.0 schemathesis>=3.39.15 # Required for openai schema test. # quantization diff --git a/requirements/test.in b/requirements/test.in index 05f6bcca5c2c4..da7a7db1f00c9 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -37,7 +37,7 @@ datamodel_code_generator # required for minicpm3 test # TODO: Use lm-eval[api]==0.4.10 once released lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test mteb[bm25s]>=2, <3 # required for mteb test -transformers==4.57.1 +transformers==4.57.3 tokenizers==0.22.0 schemathesis>=3.39.15 # Required for openai schema test. 
# quantization diff --git a/requirements/test.txt b/requirements/test.txt index bcd511660f85e..c5f103b8b0d78 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1196,7 +1196,7 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.57.1 +transformers==4.57.3 # via # -r requirements/test.in # genai-perf From 0840abdd242bbc7d0c42f0bfa73fec94a44e921b Mon Sep 17 00:00:00 2001 From: Injae Ryou Date: Fri, 28 Nov 2025 01:53:10 +0900 Subject: [PATCH 039/197] [BugFix] Optional tokenizer argument when loading GGUF models (#29582) Signed-off-by: Injae Ryou Signed-off-by: Isotr0py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Isotr0py --- vllm/config/model.py | 15 +++++----- vllm/transformers_utils/gguf_utils.py | 42 +++++++++++++++++++++++++++ vllm/transformers_utils/tokenizer.py | 10 ++++++- 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 5dabd636c18c6..21d602b30ac1a 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -439,13 +439,6 @@ class ModelConfig: self.model = maybe_model_redirect(self.model) # The tokenizer is consistent with the model by default. if self.tokenizer is None: - # Check if this is a GGUF model (either local file or remote GGUF) - if is_gguf(self.model): - raise ValueError( - "Using a tokenizer is mandatory when loading a GGUF model. " - "Please specify the tokenizer path or name using the " - "--tokenizer argument." - ) self.tokenizer = self.model if self.tokenizer_revision is None: self.tokenizer_revision = self.revision @@ -699,6 +692,14 @@ class ModelConfig: self.multimodal_config = MultiModalConfig(**mm_config_kwargs) + # Multimodal GGUF models must use original repo for mm processing + if is_gguf(self.tokenizer) and self.is_multimodal_model: + raise ValueError( + "Loading a multimodal GGUF model needs to use original " + "tokenizer. 
Please specify the unquantized hf model's " + "repo name or path using the --tokenizer argument." + ) + if self.disable_sliding_window: # Set after get_and_verify_max_len to ensure that max_model_len # can be correctly capped to sliding window size diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py index 2bf59c91a3bb1..f727b1b4726bb 100644 --- a/vllm/transformers_utils/gguf_utils.py +++ b/vllm/transformers_utils/gguf_utils.py @@ -9,6 +9,7 @@ from gguf.constants import Keys, VisionProjectorType from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig from vllm.logger import init_logger +from vllm.transformers_utils.config import list_filtered_repo_files logger = init_logger(__name__) @@ -164,3 +165,44 @@ def maybe_patch_hf_config_from_gguf( hf_config = new_hf_config return hf_config + + +def get_gguf_file_path_from_hf( + repo_id: str | Path, + quant_type: str, + revision: str | None = None, +) -> str: + """Get the GGUF file path from HuggingFace Hub based on repo_id and quant_type. 
+ + Args: + repo_id: The HuggingFace repository ID (e.g., "Qwen/Qwen3-0.6B") + quant_type: The quantization type (e.g., "Q4_K_M", "F16") + revision: Optional revision/branch name + + Returns: + The path to the GGUF file on HuggingFace Hub (e.g., "filename.gguf"), + """ + repo_id = str(repo_id) + gguf_patterns = [ + f"*-{quant_type}.gguf", + f"*-{quant_type}-*.gguf", + f"*/*-{quant_type}.gguf", + f"*/*-{quant_type}-*.gguf", + ] + matching_files = list_filtered_repo_files( + repo_id, + allow_patterns=gguf_patterns, + revision=revision, + ) + + if len(matching_files) == 0: + raise ValueError( + "Could not find GGUF file for repo %s with quantization %s.", + repo_id, + quant_type, + ) + + # Sort to ensure consistent ordering (prefer non-sharded files) + matching_files.sort(key=lambda x: (x.count("-"), x)) + gguf_filename = matching_files[0] + return gguf_filename diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index f0e0ba8ef4246..929dc8bf481cb 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -19,6 +19,7 @@ from vllm.transformers_utils.config import ( get_sentence_transformer_tokenizer_config, list_filtered_repo_files, ) +from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf from vllm.transformers_utils.tokenizers import MistralTokenizer from vllm.transformers_utils.utils import ( check_gguf_file, @@ -190,7 +191,14 @@ def get_tokenizer( kwargs["gguf_file"] = Path(tokenizer_name).name tokenizer_name = Path(tokenizer_name).parent elif is_remote_gguf(tokenizer_name): - tokenizer_name, _ = split_remote_gguf(tokenizer_name) + tokenizer_name, quant_type = split_remote_gguf(tokenizer_name) + # Get the HuggingFace Hub path for the GGUF file + gguf_file = get_gguf_file_path_from_hf( + tokenizer_name, + quant_type, + revision=revision, + ) + kwargs["gguf_file"] = gguf_file # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format # first to use 
official Mistral tokenizer if possible. From ee9841daa995a606139775043f0199d6a81037b3 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Nov 2025 01:08:08 +0800 Subject: [PATCH 040/197] [Bugfix] Fix doc build on main (#29619) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/interfaces_base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 4267b6c6598e2..85c5574bacf0a 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -167,8 +167,7 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): default_pooling_type: ClassVar[str] = "LAST" """ - Indicates the - [vllm.model_executor.layers.pooler.PoolerConfig.pooling_type][] + Indicates the [vllm.config.pooler.PoolerConfig.pooling_type][] to use by default. You can use the From d45269b37844b992dc4b34c0509ad8319bc043e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9E=9C=E5=86=BB=E8=99=BE=E4=BB=81?= Date: Fri, 28 Nov 2025 01:21:00 +0800 Subject: [PATCH 041/197] add skip_reading_prefix_cache in repr for PoolingParams (#29620) --- vllm/pooling_params.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index d1aab98c274e1..c2094a2d920a2 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -219,6 +219,7 @@ class PoolingParams( f"step_tag_id={self.step_tag_id}, " f"returned_token_ids={self.returned_token_ids}, " f"requires_token_ids={self.requires_token_ids}, " + f"skip_reading_prefix_cache={self.skip_reading_prefix_cache}, " f"extra_kwargs={self.extra_kwargs})" ) From ea228b4491342f6b7a283e1a414e1a75171a0241 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Nov 2025 02:39:59 +0800 Subject: [PATCH 042/197] [Misc] Remove unused code from `protocol.py` (#29616) Signed-off-by: DarkLight1337 --- vllm/engine/protocol.py | 9 --------- 1 file changed, 9 deletions(-) 
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 5e3374f9f6a10..6b3ee042daf3e 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -1,14 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import enum from abc import ABC, abstractmethod from collections.abc import AsyncGenerator, Iterable, Mapping from typing import Any from vllm.config import ModelConfig, VllmConfig from vllm.inputs.data import PromptType -from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.plugins.io_processors import IOProcessor @@ -19,13 +17,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.processor import Processor -logger = init_logger(__name__) - - -class Device(enum.Enum): - GPU = enum.auto() - CPU = enum.auto() - class EngineClient(ABC): """Protocol class for Clients to Engine""" From a24ea5414bc5b623cde301c8c6e1c5082ecfe412 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Nov 2025 03:04:58 +0800 Subject: [PATCH 043/197] [Deprecation] Advance deprecation status (#29617) Signed-off-by: DarkLight1337 --- vllm/config/scheduler.py | 15 +-------- vllm/distributed/parallel_state.py | 19 ----------- vllm/model_executor/models/utils.py | 49 ----------------------------- vllm/v1/core/sched/output.py | 4 +-- 4 files changed, 3 insertions(+), 84 deletions(-) diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 2cf42d57ec217..ff1ac0e18f324 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast from pydantic import Field, field_validator from pydantic.dataclasses import dataclass -from typing_extensions import Self, deprecated +from typing_extensions import Self from vllm.config.utils import config from 
vllm.logger import init_logger @@ -224,19 +224,6 @@ class SchedulerConfig: self.verify_max_model_len(max_model_len) - @property - @deprecated( - "`SchedulerConfig.chunked_prefill_enabled` has been renamed to " - "`SchedulerConfig.enable_chunked_prefill`. " - "The old name will be removed in v0.12." - ) - def chunked_prefill_enabled(self) -> bool: - return self.enable_chunked_prefill - - @chunked_prefill_enabled.setter - def chunked_prefill_enabled(self, value: bool): - self.enable_chunked_prefill = value - def verify_max_model_len(self, max_model_len: int) -> Self: if ( self.max_num_batched_tokens < max_model_len diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 69c28e278f2d2..52b433cfaf1bd 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -41,7 +41,6 @@ import torch.distributed import torch.distributed._functional_collectives as funcol import torch.distributed._symmetric_memory from torch.distributed import Backend, ProcessGroup -from typing_extensions import deprecated import vllm.envs as envs from vllm.distributed.device_communicators.base_device_communicator import ( @@ -1078,15 +1077,6 @@ def get_tp_group() -> GroupCoordinator: return _TP -@deprecated( - "`get_tensor_model_parallel_group` has been replaced with " - "`get_tp_group` and may be removed after v0.12. Please use " - "`get_tp_group` instead." -) -def get_tensor_model_parallel_group(): - return get_tp_group() - - _DCP: GroupCoordinator | None = None @@ -1130,15 +1120,6 @@ def get_pcp_group() -> GroupCoordinator: return _PCP -@deprecated( - "`get_pipeline_model_parallel_group` has been replaced with " - "`get_pp_group` and may be removed in v0.12. Please use " - "`get_pp_group` instead." 
-) -def get_pipeline_model_parallel_group(): - return get_pp_group() - - @contextmanager def graph_capture(device: torch.device): """ diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index ccefd7e66697f..f25ab9153a50d 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -10,7 +10,6 @@ import torch import torch.nn as nn from torch.func import functional_call from transformers import PretrainedConfig -from typing_extensions import deprecated from vllm.config import VllmConfig from vllm.distributed import ( @@ -481,54 +480,6 @@ def _merge_multimodal_embeddings( return inputs_embeds -@deprecated( - "`merge_multimodal_embeddings` has been replaced with " - "`SupportsMultiModal.embed_input_ids` and will be " - "removed in v0.12." -) -def merge_multimodal_embeddings( - input_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - multimodal_embeddings: NestedTensors, - placeholder_token_id: int | list[int], -) -> torch.Tensor: - """ - Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the - positions in `inputs_embeds` corresponding to placeholder tokens in - `input_ids`. - - `placeholder_token_id` can be a list of token ids (e.g, token ids - of img_start, img_break, and img_end tokens) when needed: This means - the order of these tokens in the `input_ids` MUST MATCH the order of - their embeddings in `multimodal_embeddings` since we need to - slice-merge instead of individually scattering. - - For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where - - T is text token - - S is image start token - - I is image embedding token - - B is image break token - - E is image end token. - - Then the image embeddings (that correspond to I's) from vision encoder - must be padded with embeddings of S, B, and E in the same order of - input_ids for a correct embedding merge. - - Note: - This updates `inputs_embeds` in place. 
- """ - if isinstance(placeholder_token_id, list): - is_multimodal = isin_list(input_ids, placeholder_token_id) - else: - is_multimodal = input_ids == placeholder_token_id - - return _merge_multimodal_embeddings( - inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - is_multimodal=is_multimodal, - ) - - def isin_list( elements: torch.Tensor, test_elements_list: list[int], diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 7902513dce49a..abfab43499b2a 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -126,12 +126,12 @@ class CachedRequestData: return len(self.req_ids) @cached_property - @deprecated("use resumed_req_ids field") + @deprecated("This will be removed in v0.14, use `resumed_req_ids` instead.") def resumed_from_preemption(self) -> list[bool]: return [req_id in self.resumed_req_ids for req_id in self.req_ids] @cached_property - @deprecated("use all_token_ids field") + @deprecated("This will be removed in v0.14, use `all_token_ids` instead.") def resumed_req_token_ids(self) -> list[list[int] | None]: return [ self.all_token_ids[req_id] if req_id in self.resumed_req_ids else None From 38658ec6f3b3a09a6cd205bab23a550b3d3f8c0e Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 28 Nov 2025 03:17:37 +0800 Subject: [PATCH 044/197] [Bugfix][MM encoder] Fix ViT attention backend resolving for Turing GPU (#29614) Signed-off-by: Isotr0py --- vllm/platforms/cuda.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index d5c3a177d9c2b..4bf9401b6b051 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -264,14 +264,15 @@ class CudaPlatformBase(Platform): cls, head_size: int, dtype: torch.dtype ) -> "AttentionBackendEnum": # Try FlashAttention first - try: - backend_class = AttentionBackendEnum.FLASH_ATTN.get_class() - if backend_class.supports_head_size( - head_size - ) and 
backend_class.supports_dtype(dtype): - return AttentionBackendEnum.FLASH_ATTN - except ImportError: - pass + if (cc := cls.get_device_capability()) and cc.major >= 8: + try: + backend_class = AttentionBackendEnum.FLASH_ATTN.get_class() + if backend_class.supports_head_size( + head_size + ) and backend_class.supports_dtype(dtype): + return AttentionBackendEnum.FLASH_ATTN + except ImportError: + pass return AttentionBackendEnum.TORCH_SDPA From e5a621b724e5570aaffc4bbf9c5f6ec9bca63333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 27 Nov 2025 20:31:52 +0100 Subject: [PATCH 045/197] [CI] Add batched audios Whisper test (#29308) Signed-off-by: NickLucche --- .../openai/test_transcription_validation.py | 197 +-------------- .../test_transcription_validation_whisper.py | 237 ++++++++++++++++++ 2 files changed, 238 insertions(+), 196 deletions(-) create mode 100644 tests/entrypoints/openai/test_transcription_validation_whisper.py diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 88580ed899f1a..8045ab1468d6a 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -2,20 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # imports for structured outputs tests -import io import json -import librosa -import numpy as np -import openai import pytest -import pytest_asyncio -import soundfile as sf from ...utils import RemoteOpenAIServer -MODEL_NAME = "openai/whisper-large-v3-turbo" -SERVER_ARGS = ["--enforce-eager"] MISTRAL_FORMAT_ARGS = [ "--tokenizer_mode", "mistral", @@ -26,22 +18,8 @@ MISTRAL_FORMAT_ARGS = [ ] -@pytest.fixture(scope="module") -def server(): - with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server: - yield remote_server - - -@pytest_asyncio.fixture -async def client(server): - async with server.get_async_client() as async_client: 
- yield async_client - - @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"] -) +@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"]) async def test_basic_audio(mary_had_lamb, model_name): server_args = ["--enforce-eager"] @@ -120,176 +98,3 @@ async def test_basic_audio_gemma(foscolo): ) out = json.loads(transcription)["text"] assert "da cui vergine nacque Venere" in out - - -@pytest.mark.asyncio -async def test_non_asr_model(winning_call): - # text to text model - model_name = "JackFram/llama-68m" - with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server: - client = remote_server.get_async_client() - res = await client.audio.transcriptions.create( - model=model_name, file=winning_call, language="en", temperature=0.0 - ) - err = res.error - assert err["code"] == 400 and not res.text - assert err["message"] == "The model does not support Transcriptions API" - - -@pytest.mark.asyncio -async def test_bad_requests(mary_had_lamb, client): - # invalid language - with pytest.raises(openai.BadRequestError): - await client.audio.transcriptions.create( - model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0 - ) - - -@pytest.mark.asyncio -async def test_long_audio_request(mary_had_lamb, client): - mary_had_lamb.seek(0) - audio, sr = librosa.load(mary_had_lamb) - # Add small silence after each audio for repeatability in the split process - audio = np.pad(audio, (0, 1600)) - repeated_audio = np.tile(audio, 10) - # Repeated audio to buffer - buffer = io.BytesIO() - sf.write(buffer, repeated_audio, sr, format="WAV") - buffer.seek(0) - transcription = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=buffer, - language="en", - response_format="text", - temperature=0.0, - ) - out = json.loads(transcription) - out_text = out["text"] - out_usage = out["usage"] - counts = out_text.count("Mary had a little lamb") - assert counts == 10, counts 
- assert out_usage["seconds"] == 161, out_usage["seconds"] - - -@pytest.mark.asyncio -async def test_completion_endpoints(client): - # text to text model - res = await client.chat.completions.create( - model=MODEL_NAME, - messages=[{"role": "system", "content": "You are a helpful assistant."}], - ) - err = res.error - assert err["code"] == 400 - assert err["message"] == "The model does not support Chat Completions API" - - res = await client.completions.create(model=MODEL_NAME, prompt="Hello") - err = res.error - assert err["code"] == 400 - assert err["message"] == "The model does not support Completions API" - - -@pytest.mark.asyncio -async def test_streaming_response(winning_call, client): - transcription = "" - res_no_stream = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=winning_call, - response_format="json", - language="en", - temperature=0.0, - ) - res = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=winning_call, - language="en", - temperature=0.0, - stream=True, - timeout=30, - ) - # Reconstruct from chunks and validate - async for chunk in res: - text = chunk.choices[0]["delta"]["content"] - transcription += text - - assert transcription == res_no_stream.text - - -@pytest.mark.asyncio -async def test_stream_options(winning_call, client): - res = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=winning_call, - language="en", - temperature=0.0, - stream=True, - extra_body=dict(stream_include_usage=True, stream_continuous_usage_stats=True), - timeout=30, - ) - final = False - continuous = True - async for chunk in res: - if not len(chunk.choices): - # final usage sent - final = True - else: - continuous = continuous and hasattr(chunk, "usage") - assert final and continuous - - -@pytest.mark.asyncio -async def test_sampling_params(mary_had_lamb, client): - """ - Compare sampling with params and greedy sampling to assert results - are different when extreme sampling parameters values are picked. 
- """ - transcription = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=mary_had_lamb, - language="en", - temperature=0.8, - extra_body=dict( - seed=42, - repetition_penalty=1.9, - top_k=12, - top_p=0.4, - min_p=0.5, - frequency_penalty=1.8, - presence_penalty=2.0, - ), - ) - - greedy_transcription = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=mary_had_lamb, - language="en", - temperature=0.0, - extra_body=dict(seed=42), - ) - - assert greedy_transcription.text != transcription.text - - -@pytest.mark.asyncio -async def test_audio_prompt(mary_had_lamb, client): - prompt = "This is a speech, recorded in a phonograph." - # Prompts should not omit the part of original prompt while transcribing. - prefix = "The first words I spoke in the original phonograph" - transcription = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=mary_had_lamb, - language="en", - response_format="text", - temperature=0.0, - ) - out = json.loads(transcription)["text"] - assert prefix in out - transcription_wprompt = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=mary_had_lamb, - language="en", - response_format="text", - prompt=prompt, - temperature=0.0, - ) - out_prompt = json.loads(transcription_wprompt)["text"] - assert prefix in out_prompt diff --git a/tests/entrypoints/openai/test_transcription_validation_whisper.py b/tests/entrypoints/openai/test_transcription_validation_whisper.py new file mode 100644 index 0000000000000..82c50e58a0168 --- /dev/null +++ b/tests/entrypoints/openai/test_transcription_validation_whisper.py @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# imports for structured outputs tests +import asyncio +import io +import json + +import librosa +import numpy as np +import openai +import pytest +import pytest_asyncio +import soundfile as sf + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = 
"openai/whisper-large-v3-turbo" +SERVER_ARGS = ["--enforce-eager"] + + +@pytest.fixture(scope="module") +def server(): + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def whisper_client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +async def test_basic_audio(mary_had_lamb): + server_args = ["--enforce-eager"] + + # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. + with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server: + client = remote_server.get_async_client() + transcription = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + response_format="text", + temperature=0.0, + ) + out = json.loads(transcription) + out_text = out["text"] + out_usage = out["usage"] + assert "Mary had a little lamb," in out_text + assert out_usage["seconds"] == 16, out_usage["seconds"] + + +@pytest.mark.asyncio +async def test_basic_audio_batched(mary_had_lamb, winning_call, whisper_client): + transcription = whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + response_format="text", + temperature=0.0, + ) + transcription2 = whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=winning_call, + language="en", + response_format="text", + temperature=0.0, + ) + # Await both transcriptions by scheduling coroutines together + transcription, transcription2 = await asyncio.gather(transcription, transcription2) + out = json.loads(transcription) + out_text = out["text"] + assert "Mary had a little lamb," in out_text + out2 = json.loads(transcription2) + out_text2 = out2["text"] + assert "Edgar Martinez" in out_text2 + + +@pytest.mark.asyncio +async def test_bad_requests(mary_had_lamb, whisper_client): + # invalid language + with pytest.raises(openai.BadRequestError): + 
await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0 + ) + + +@pytest.mark.asyncio +async def test_long_audio_request(mary_had_lamb, whisper_client): + mary_had_lamb.seek(0) + audio, sr = librosa.load(mary_had_lamb) + # Add small silence after each audio for repeatability in the split process + audio = np.pad(audio, (0, 1600)) + repeated_audio = np.tile(audio, 10) + # Repeated audio to buffer + buffer = io.BytesIO() + sf.write(buffer, repeated_audio, sr, format="WAV") + buffer.seek(0) + transcription = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=buffer, + language="en", + response_format="text", + temperature=0.0, + ) + out = json.loads(transcription) + out_text = out["text"] + out_usage = out["usage"] + counts = out_text.count("Mary had a little lamb") + assert counts == 10, counts + assert out_usage["seconds"] == 161, out_usage["seconds"] + + +@pytest.mark.asyncio +async def test_completion_endpoints(whisper_client): + # text to text model + res = await whisper_client.chat.completions.create( + model=MODEL_NAME, + messages=[{"role": "system", "content": "You are a helpful assistant."}], + ) + err = res.error + assert err["code"] == 400 + assert err["message"] == "The model does not support Chat Completions API" + + res = await whisper_client.completions.create(model=MODEL_NAME, prompt="Hello") + err = res.error + assert err["code"] == 400 + assert err["message"] == "The model does not support Completions API" + + +@pytest.mark.asyncio +async def test_streaming_response(winning_call, whisper_client): + transcription = "" + res_no_stream = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=winning_call, + response_format="json", + language="en", + temperature=0.0, + ) + res = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=winning_call, + language="en", + temperature=0.0, + stream=True, + timeout=30, + ) + # 
Reconstruct from chunks and validate + async for chunk in res: + text = chunk.choices[0]["delta"]["content"] + transcription += text + + assert transcription == res_no_stream.text + + +@pytest.mark.asyncio +async def test_stream_options(winning_call, whisper_client): + res = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=winning_call, + language="en", + temperature=0.0, + stream=True, + extra_body=dict(stream_include_usage=True, stream_continuous_usage_stats=True), + timeout=30, + ) + final = False + continuous = True + async for chunk in res: + if not len(chunk.choices): + # final usage sent + final = True + else: + continuous = continuous and hasattr(chunk, "usage") + assert final and continuous + + +@pytest.mark.asyncio +async def test_sampling_params(mary_had_lamb, whisper_client): + """ + Compare sampling with params and greedy sampling to assert results + are different when extreme sampling parameters values are picked. + """ + transcription = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + temperature=0.8, + extra_body=dict( + seed=42, + repetition_penalty=1.9, + top_k=12, + top_p=0.4, + min_p=0.5, + frequency_penalty=1.8, + presence_penalty=2.0, + ), + ) + + greedy_transcription = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + temperature=0.0, + extra_body=dict(seed=42), + ) + + assert greedy_transcription.text != transcription.text + + +@pytest.mark.asyncio +async def test_audio_prompt(mary_had_lamb, whisper_client): + prompt = "This is a speech, recorded in a phonograph." + # Prompts should not omit the part of original prompt while transcribing. 
+ prefix = "The first words I spoke in the original phonograph" + transcription = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + response_format="text", + temperature=0.0, + ) + out = json.loads(transcription)["text"] + assert prefix in out + transcription_wprompt = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + response_format="text", + prompt=prompt, + temperature=0.0, + ) + out_prompt = json.loads(transcription_wprompt)["text"] + assert prefix in out_prompt From a5345bf49df74cd394a07797649f51cd67c6c697 Mon Sep 17 00:00:00 2001 From: Andrii Skliar Date: Thu, 27 Nov 2025 20:34:59 +0100 Subject: [PATCH 046/197] [BugFix] Fix `plan` API Mismatch when using latest FlashInfer (#29426) Signed-off-by: Andrii Skliar Co-authored-by: Andrii Skliar --- docker/Dockerfile | 4 ++-- requirements/cuda.txt | 2 +- vllm/v1/attention/backends/flashinfer.py | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index aa3aad21d6c07..eb7c105071c00 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -398,8 +398,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # Install FlashInfer pre-compiled kernel cache and binaries # https://docs.flashinfer.ai/installation.html RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system flashinfer-cubin==0.5.2 \ - && uv pip install --system flashinfer-jit-cache==0.5.2 \ + uv pip install --system flashinfer-cubin==0.5.3 \ + && uv pip install --system flashinfer-jit-cache==0.5.3 \ --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.') \ && flashinfer show-config diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 15e8aadc56f47..462f18ef7159b 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -10,4 +10,4 @@ torchaudio==2.9.0 # These must be updated alongside torch torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # FlashInfer should be updated together with the Dockerfile -flashinfer-python==0.5.2 +flashinfer-python==0.5.3 diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index dbd72b298b1fd..777398bf8a20e 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -1508,7 +1508,7 @@ def fast_plan_decode( qo_indptr_host = _get_range_buf(batch_size + 1, "cpu") try: - # Make sure we pass exactly 18 arguments for tensor core version + # Make sure we pass exactly 19 arguments for tensor core version self._plan_info = self._cached_module.plan( self._float_workspace_buffer, self._int_workspace_buffer, @@ -1528,6 +1528,7 @@ def fast_plan_decode( window_left, fixed_split_size, disable_split_kv, + 0, ) except Exception as e: raise RuntimeError(f"Error in tensor core plan: {e}") from e From ae0ce1be272105f02a3ac6a63e646690be2481fb Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 27 Nov 2025 12:38:53 -0800 Subject: [PATCH 047/197] [Model Runner V2][BugFix] Keep reference to GPU tensors in AsyncOutput (#29623) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/async_utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu/async_utils.py b/vllm/v1/worker/gpu/async_utils.py index 421fb29a7f87f..f6bc607c1ae67 100644 --- a/vllm/v1/worker/gpu/async_utils.py +++ b/vllm/v1/worker/gpu/async_utils.py @@ -21,6 +21,9 @@ class AsyncOutput(AsyncModelRunnerOutput): copy_stream: torch.cuda.Stream, copy_event: torch.cuda.Event, ): + # 
NOTE(woosuk): We must retain references to the GPU tensors, + # as the copy operations are performed on a different CUDA stream than + # the one where the tensors were created. self.model_runner_output = model_runner_output self.sampler_output = sampler_output self.num_sampled_tokens = num_sampled_tokens @@ -51,7 +54,9 @@ class AsyncOutput(AsyncModelRunnerOutput): ) else: self.logprobs_tensors = None - self.num_sampled_tokens = num_sampled_tokens.to("cpu", non_blocking=True) + self.num_sampled_tokens_cpu = num_sampled_tokens.to( + "cpu", non_blocking=True + ) self.prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {} if self.model_runner_output.prompt_logprobs_dict: for k, v in self.model_runner_output.prompt_logprobs_dict.items(): @@ -63,7 +68,7 @@ class AsyncOutput(AsyncModelRunnerOutput): def get_output(self) -> ModelRunnerOutput: self.copy_event.synchronize() - num_sampled_tokens_np = self.num_sampled_tokens.numpy() + num_sampled_tokens_np = self.num_sampled_tokens_cpu.numpy() # NOTE(woosuk): The following code is to ensure compatibility with # the existing model runner. 
From be493e0b3cfb5810d254e9845217878a39a4853b Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 27 Nov 2025 16:45:38 -0500 Subject: [PATCH 048/197] [BugFix] Fix new nightly failures (#29578) Signed-off-by: Lucas Wilkinson --- vllm/v1/attention/backends/utils.py | 26 ++++++++++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 12 +++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 6e0d84e4fb4ac..27f07218d9b2e 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -100,6 +100,32 @@ class CommonAttentionMetadata: dcp_local_seq_lens_cpu: torch.Tensor | None = None """Sequence lengths of the local rank in decode context parallelism world""" + # TODO(lucas): remove once we have FULL-CG spec-decode support + def unpadded( + self, num_actual_tokens: int, num_actual_reqs: int + ) -> "CommonAttentionMetadata": + maybe_slice_reqs = lambda x: x[:num_actual_reqs] if x is not None else None + return CommonAttentionMetadata( + query_start_loc=self.query_start_loc[: num_actual_reqs + 1], + query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1], + seq_lens=self.seq_lens[:num_actual_reqs], + seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs], + num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs], + num_reqs=num_actual_reqs, + num_actual_tokens=num_actual_tokens, + max_query_len=self.max_query_len, + max_seq_len=self.max_seq_len, + block_table_tensor=self.block_table_tensor[:num_actual_reqs], + slot_mapping=self.slot_mapping[:num_actual_tokens], + causal=self.causal, + logits_indices_padded=self.logits_indices_padded, + num_logits_indices=self.num_logits_indices, + encoder_seq_lens=maybe_slice_reqs(self.encoder_seq_lens), + encoder_seq_lens_cpu=maybe_slice_reqs(self.encoder_seq_lens_cpu), + dcp_local_seq_lens=maybe_slice_reqs(self.dcp_local_seq_lens), + 
dcp_local_seq_lens_cpu=maybe_slice_reqs(self.dcp_local_seq_lens_cpu), + ) + def slice_query_start_locs( query_start_loc: torch.Tensor, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0ae4eb48acf22..6bff83658b45a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1551,7 +1551,7 @@ class GPUModelRunner( # Encoder-only layers do not have KV cache, so we need to # create a dummy block table and slot mapping for them. blk_table_tensor = torch.zeros( - (num_tokens_padded, 1), + (num_reqs_padded, 1), dtype=torch.int32, device=self.device, ) @@ -1652,6 +1652,16 @@ class GPUModelRunner( for layer_name in attn_group.layer_names: attn_metadata[layer_name] = attn_metadata_i + if spec_decode_common_attn_metadata is not None and ( + num_reqs != num_reqs_padded or num_tokens != num_tokens_padded + ): + # Currently the drafter still only uses piecewise cudagraphs (and modifies + # the attention metadata in directly), and therefore does not want to use + # padded attention metadata. 
+ spec_decode_common_attn_metadata = ( + spec_decode_common_attn_metadata.unpadded(num_tokens, num_reqs) + ) + return attn_metadata, spec_decode_common_attn_metadata def _compute_cascade_attn_prefix_lens( From 35657bcd7a5fd7a7af1aa1b19d78eb8973ec79c1 Mon Sep 17 00:00:00 2001 From: scydas Date: Fri, 28 Nov 2025 09:34:33 +0800 Subject: [PATCH 049/197] [CPU]Update CPU PyTorch to 2.9.0 (#29589) Signed-off-by: scyda Co-authored-by: Li, Jiang --- docker/Dockerfile.cpu | 4 ---- requirements/cpu-build.txt | 4 ++-- requirements/cpu.txt | 8 ++++---- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index eb3807ef0ca4e..67d3fb83a0275 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -119,7 +119,6 @@ FROM base AS vllm-test-deps WORKDIR /workspace/vllm -# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ cp requirements/test.in requirements/cpu-test.in && \ sed -i '/mamba_ssm/d' requirements/cpu-test.in && \ @@ -132,9 +131,6 @@ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ esac; \ }; \ remove_packages_not_supported_on_aarch64 && \ - sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \ - sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \ - sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \ uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu RUN --mount=type=cache,target=/root/.cache/uv \ diff --git a/requirements/cpu-build.txt b/requirements/cpu-build.txt index 81d429a5e5f8d..0c6fdd3b33cd1 100644 --- a/requirements/cpu-build.txt +++ b/requirements/cpu-build.txt @@ -4,9 +4,9 @@ packaging>=24.2 setuptools>=77.0.3,<81.0.0 setuptools-scm>=8 --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.8.0+cpu; platform_machine == 
"x86_64" or platform_machine == "s390x" +torch==2.9.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" torch==2.9.0; platform_system == "Darwin" -torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +torch==2.9.0; platform_machine == "ppc64le" or platform_machine == "aarch64" scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL) wheel jinja2>=3.1.6 diff --git a/requirements/cpu.txt b/requirements/cpu.txt index e23d3286f3f78..8c04d6d5ce1b0 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -7,17 +7,17 @@ numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative d packaging>=24.2 setuptools>=77.0.3,<81.0.0 --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" +torch==2.9.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" torch==2.9.0; platform_system == "Darwin" -torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +torch==2.9.0; platform_machine == "ppc64le" or platform_machine == "aarch64" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" -torchaudio==2.8.0; platform_machine == "ppc64le" +torchaudio==2.9.0; platform_machine == "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" -torchvision==0.23.0; platform_machine == "ppc64le" +torchvision==0.24.0; platform_machine == "ppc64le" datasets # for benchmark scripts # Intel Extension for PyTorch, only for x86_64 CPUs From 745a3bae1aef2ff3aa70b3eab8624e4571698ba0 Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Thu, 27 Nov 2025 18:48:28 -0800 Subject: [PATCH 050/197] [LoRA] Support FusedMoE LoRA Triton kernel for mxfp4 (#28971) 
Signed-off-by: Xin Yang Co-authored-by: Jee Jee Li --- .../moe/test_modular_oai_triton_moe.py | 250 ++++++++++++++++++ vllm/lora/layers/fused_moe.py | 37 ++- .../fused_moe/gpt_oss_triton_kernels_moe.py | 146 ++++++++++ .../layers/quantization/mxfp4.py | 20 +- 4 files changed, 441 insertions(+), 12 deletions(-) create mode 100644 tests/kernels/moe/test_modular_oai_triton_moe.py diff --git a/tests/kernels/moe/test_modular_oai_triton_moe.py b/tests/kernels/moe/test_modular_oai_triton_moe.py new file mode 100644 index 0000000000000..3361d85e92507 --- /dev/null +++ b/tests/kernels/moe/test_modular_oai_triton_moe.py @@ -0,0 +1,250 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Test modular OAI Triton MoE +""" + +import pytest +import torch + +from vllm.utils.import_utils import has_triton_kernels + +if not has_triton_kernels(): + pytest.skip( + "triton_kernels not found, skipping all related tests", + allow_module_level=True, + ) + +from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig +from triton_kernels.numerics import InFlexData +from triton_kernels.numerics_details.mxfp import downcast_to_mxfp, upcast_from_mxfp +from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor +from triton_kernels.tensor_details import layout +from triton_kernels.testing import assert_close + +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_config +from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( + OAITritonExperts, + UnfusedOAITritonExperts, +) +from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, +) +from vllm.model_executor.layers.utils import shuffle_weight +from vllm.platforms import current_platform + +MNK = [ + (1, 512, 384), + (1, 2880, 2880), 
+ (2, 512, 384), + (2, 2880, 2880), + (32, 2880, 2880), + (64, 2880, 2880), +] + + +def unshuffle_weight(w: torch.Tensor): + first = w[..., ::2] + second = w[..., 1::2] + return torch.concat((first, second), dim=-1) + + +def make_weights(dtype, k, n, e): + w1 = torch.randn((e, k, 2 * n), dtype=dtype, device="cuda") + w1_bias = torch.randn((e, 2 * n), dtype=dtype, device="cuda") + + w2 = torch.randn((e, n, k), dtype=dtype, device="cuda") + w2_bias = torch.randn((e, k), dtype=dtype, device="cuda") + + w1_tri = w1.clone() + w2_tri = w2.clone() + + w1_bias_tri = w1_bias.clone() + w2_bias_tri = w2_bias.clone() + w1_bias_tri = w1_bias_tri.to(torch.float32) + w2_bias_tri = w2_bias_tri.to(torch.float32) + + # shuffle weights + w1_tri = shuffle_weight(w1_tri) + w1_bias_tri = shuffle_weight(w1_bias_tri) + + # quant triton_weights + w1_tri, w1_scale_tri = downcast_to_mxfp(w1_tri, torch.uint8, axis=1) + w1 = upcast_from_mxfp(w1_tri, w1_scale_tri, dtype, axis=1) + w1 = unshuffle_weight(w1) + + w2_tri, w2_scale_tri = downcast_to_mxfp(w2_tri, torch.uint8, axis=1) + w2 = upcast_from_mxfp(w2_tri, w2_scale_tri, dtype, axis=1) + + num_warps = 8 + w_layout, w_layout_opts = layout.make_default_matmul_mxfp4_w_layout(mx_axis=1) + w_scale_layout, w_scale_layout_opts = ( + layout.make_default_matmul_mxfp4_w_scale_layout(mx_axis=1, num_warps=num_warps) + ) + + w1_tri = convert_layout(wrap_torch_tensor(w1_tri, FP4), w_layout, **w_layout_opts) + w1_scale_tri = convert_layout( + wrap_torch_tensor(w1_scale_tri), + w_scale_layout, + **w_scale_layout_opts, + ) + + w2_tri = convert_layout(wrap_torch_tensor(w2_tri, FP4), w_layout, **w_layout_opts) + w2_scale_tri = convert_layout( + wrap_torch_tensor(w2_scale_tri), + w_scale_layout, + **w_scale_layout_opts, + ) + + w1_precision_config = PrecisionConfig( + weight_scale=w1_scale_tri, flex_ctx=FlexCtx(rhs_data=InFlexData()) + ) + w2_precision_config = PrecisionConfig( + weight_scale=w2_scale_tri, flex_ctx=FlexCtx(rhs_data=InFlexData()) + ) + + return ( 
+ w1, + w2, + w1_bias, + w2_bias, + w1_tri, + w2_tri, + w1_bias_tri, + w2_bias_tri, + w1_precision_config, + w2_precision_config, + ) + + +def swiglu(x, alpha: float = 1.702, limit: float = 1.0): + # Note we add an extra bias of 1 to the linear layer + x_glu, x_linear = torch.chunk(x, 2, dim=-1) + if limit is not None: + x_glu = x_glu.clamp(max=limit) + out_glu = x_glu * torch.sigmoid(alpha * x_glu) + if limit is not None: + x_linear = x_linear.clamp(min=-limit, max=limit) + return out_glu * (x_linear + 1) + + +def torch_moe_impl( + hidden_states: torch.Tensor, # (M, K) + w1: torch.Tensor, # (E, K, 2N) + w2: torch.Tensor, # (E, N, K) + w1_bias: torch.Tensor, # (E, 2N) + w2_bias: torch.Tensor, # (E, K) + topk_weights: torch.Tensor, # (M, topk) + topk_ids: torch.Tensor, # (M, topk) +): + w1 = w1[topk_ids, ...] + w1_bias = w1_bias[topk_ids, ...] + hidden_states = torch.einsum("bekc,bk->bec", w1, hidden_states) + w1_bias + hidden_states = swiglu(hidden_states, limit=7) + + w2 = w2[topk_ids, ...] + w2_bias = w2_bias[topk_ids, ...] 
+ hidden_states = torch.einsum("bekc,bek->bec", w2, hidden_states) + w2_bias + + # Weighted sum of experts + hidden_states = torch.einsum("bec,be->bc", hidden_states, topk_weights) + return hidden_states + + +def oai_triton_moe_impl( + x: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: "PrecisionConfig", + w2_scale: "PrecisionConfig", + w1_bias: torch.Tensor | None, + w2_bias: torch.Tensor | None, + num_experts: int, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + unfused: bool = False, +) -> torch.Tensor: + quant_config = mxfp4_w4a16_moe_quant_config( + w1_bias=w1_bias, + w2_bias=w2_bias, + w1_scale=w1_scale, + w2_scale=w2_scale, + ) + + if unfused: + fused_experts = UnfusedOAITritonExperts(quant_config) + else: + fused_experts = OAITritonExperts(quant_config) + + mk = FusedMoEModularKernel(MoEPrepareAndFinalizeNoEP(), fused_experts) + + return mk.forward( + hidden_states=x, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation="swigluoai", + global_num_experts=num_experts, + expert_map=None, + apply_router_weight_on_input=False, + ) + + +@pytest.mark.skipif( + not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform." 
+) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("m,n,k", MNK) +@pytest.mark.parametrize("num_experts", [32, 128]) +@pytest.mark.parametrize("topk", [4]) +@pytest.mark.parametrize("unfused", [True, False]) +def test_oai_triton_moe( + dtype: torch.dtype, + m: int, + n: int, + k: int, + num_experts: int, + topk: int, + unfused: bool, +): + current_platform.seed_everything(0) + ( + w1, + w2, + w1_bias, + w2_bias, + w1_tri, + w2_tri, + w1_bias_tri, + w2_bias_tri, + w1_precision_config, + w2_precision_config, + ) = make_weights(dtype, k, n, num_experts) + + x = torch.randn((m, k), dtype=dtype, device="cuda") + router_logits = torch.randn(m, num_experts, device="cuda", dtype=dtype) + topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1, sorted=True) + topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1) + + with set_current_vllm_config(VllmConfig()): + out_ref = torch_moe_impl(x, w1, w2, w1_bias, w2_bias, topk_weights, topk_ids) + + out = oai_triton_moe_impl( + x, + w1_tri, + w2_tri, + w1_precision_config, + w2_precision_config, + w1_bias_tri, + w2_bias_tri, + num_experts, + topk_weights, + topk_ids, + unfused, + ) + + assert_close(ref=out_ref, tri=out, maxtol=0.025, rmstol=0.005) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 3ad19370962ab..24cab79a72443 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -20,15 +20,24 @@ from vllm.model_executor.layers.fused_moe.config import ( _get_config_dtype_str, ) from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - modular_marlin_fused_moe, + MarlinExperts, ) from vllm.model_executor.layers.fused_moe.fused_moe import ( - modular_triton_fused_moe, + TritonExperts, try_get_optimal_moe_config, ) from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import ( FusedMoEModularMethod, ) +from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( + 
UnfusedOAITritonExperts, +) +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel, +) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, +) from .utils import _get_lora_device @@ -114,15 +123,23 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.base_layer.ensure_moe_quant_config_init() quant_config = self.base_layer.quant_method.moe_quant_config - m_fused_moe_fn = ( - modular_triton_fused_moe( - quant_config, shared_experts=self.base_layer.shared_experts - ) - if not quant_config.use_mxfp4_w4a16 - else modular_marlin_fused_moe( - quant_config, shared_experts=self.base_layer.shared_experts - ) + prepare_finalize = MoEPrepareAndFinalizeNoEP() + m_fused_moe_fn = FusedMoEModularKernel( + prepare_finalize, + self.base_layer.quant_method.select_gemm_impl( + prepare_finalize, self.base_layer + ), + self.base_layer.shared_experts, + getattr(self.base_layer, "shared_experts_stream", None), ) + if quant_config.use_mxfp4_w4a16: + assert isinstance( + m_fused_moe_fn.fused_experts, (MarlinExperts, UnfusedOAITritonExperts) + ) + else: + assert isinstance( + m_fused_moe_fn.fused_experts, (MarlinExperts, TritonExperts) + ) def fwd_decorator(layer, func): def wrapper(*args, **kwargs): diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 128507639fdfd..0b006e15632e1 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -5,6 +5,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import ( FUSED_MOE_UNQUANTIZED_CONFIG, @@ -376,3 +377,148 @@ class OAITritonExperts(BaseOAITritonExperts): intermediate_cache=workspace2, a1q_scale=a1q_scale, ) + + +class 
UnfusedOAITritonExperts(BaseOAITritonExperts): + """ + A Triton based MoE expert class that operates on expert standard + format and explicitly keeps the activation and reduction (moe_sum) steps + unfused from the matmul_ogs kernel. This exposes injection points + for activation and moe_sum. + + One use case for it is to inject LoRA modules on the activation and moe_sum. + """ + + def __init__(self, quant_config: FusedMoEQuantConfig): + # TODO (varun) : Enable activation quantization + assert quant_config.use_mxfp4_w4a16, "Supports only mxfp4_w4a16" + super().__init__(quant_config) + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return ( + mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard, + ) + + def supports_chunking(self) -> bool: + return True + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + # workspace are allocated inside the kernel + workspace1 = (M * topk, N // 2) + workspace2 = (M * topk, max(N, K)) + output = (M, K) + return (workspace1, workspace2, output) + + def moe_sum(self, input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + apply_router_weight_on_input: bool, + ): + if self.quant_config is None: + self.quant_config = FUSED_MOE_UNQUANTIZED_CONFIG + + if expert_map is not None: + topk_ids = 
expert_map[topk_ids] + + local_num_experts = w1.size(0) + if global_num_experts == -1: + global_num_experts = local_num_experts + + routing_data, gather_indx, scatter_indx = self._make_routing_data( + topk_ids, topk_weights, local_num_experts + ) + + topk = topk_ids.size(1) + + # type check, uint8 means mxfp4 + assert hidden_states.dtype == torch.bfloat16 + assert ( + self.quant_config.w1_bias is None + or self.quant_config.w1_bias.dtype == torch.float32 + ) + assert ( + self.quant_config.w2_bias is None + or self.quant_config.w2_bias.dtype == torch.float32 + ) + + # Shape check, only check non-mxfp4 + assert hidden_states.ndim == 2 + assert hidden_states.shape[-1] == w1.shape[-2] + assert w2.shape[-1] == w1.shape[1] + + batch_dim = 1 + M, K = hidden_states.shape + E, _, N = w1.shape + + if global_num_experts == -1: + global_num_experts = E + + # Note that the output tensor might be in workspace13 + intermediate_cache1 = _resize_cache(workspace2, (batch_dim, M * topk, N)) + intermediate_cache3 = _resize_cache(workspace2, (batch_dim, M * topk, K)) + intermediate_cache2 = _resize_cache(workspace13, (M * topk, N // 2)) + + gammas = routing_data.gate_scal if routing_data else None + + matmul_ogs( + hidden_states, + w1, + self.quant_config.w1_bias, + routing_data, + gather_indx=gather_indx, + precision_config=self.quant_config.w1_precision, + gammas=gammas if apply_router_weight_on_input else None, + fused_activation=None, + y=intermediate_cache1, + ) + + self.activation( + activation, intermediate_cache2, intermediate_cache1.view(-1, N) + ) + + # matmul_ogs grouped reduction fuse sum across multiple experts: + # y[dst_ind // n_expts_act, :] += x[src_ind, :] + # Need to set n_expts_act to 1 to unfuse moe_sum + routing_data.n_expts_act = 1 + + matmul_ogs( + intermediate_cache2, + w2, + self.quant_config.w2_bias, + routing_data, + scatter_indx=scatter_indx, + precision_config=self.quant_config.w2_precision, + gammas=None if apply_router_weight_on_input else gammas, + 
y=intermediate_cache3, + ) + + self.moe_sum(intermediate_cache3.view(-1, topk, K), output) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index bc241ac692e23..74036753496d4 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -30,6 +30,7 @@ from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( ) from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( OAITritonExperts, + UnfusedOAITritonExperts, ) from vllm.model_executor.layers.fused_moe.trtllm_moe import TrtLlmGenExperts from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod @@ -83,8 +84,21 @@ def get_mxfp4_backend_with_lora() -> Mxfp4Backend: if not current_platform.is_cuda(): return Mxfp4Backend.NONE - logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend") - return Mxfp4Backend.MARLIN + # If FlashInfer is not available, try either Marlin or Triton + triton_kernels_supported = ( + has_triton_kernels() + and is_torch_equal_or_newer("2.8.0") + # NOTE: triton_kernels are only confirmed to work on SM90 and SM100 + # SM110 fails with this error: https://github.com/vllm-project/vllm/issues/29317 + # SM120 needs this fix: https://github.com/triton-lang/triton/pull/8498 + and (9, 0) <= current_platform.get_device_capability() < (11, 0) + ) + if envs.VLLM_MXFP4_USE_MARLIN or not triton_kernels_supported: + logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend") + return Mxfp4Backend.MARLIN + + logger.info_once("[get_mxfp4_backend_with_lora] Using Triton backend") + return Mxfp4Backend.TRITON def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend: @@ -854,6 +868,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): elif self.mxfp4_backend == Mxfp4Backend.MARLIN: return MarlinExperts(self.moe_quant_config) elif self.mxfp4_backend == Mxfp4Backend.TRITON: + if self.moe.is_lora_enabled: + return 
UnfusedOAITritonExperts(self.moe_quant_config) return OAITritonExperts(self.moe_quant_config) else: raise NotImplementedError( From 18523b87f67b12e9044d690dfe9da7cddc390627 Mon Sep 17 00:00:00 2001 From: Wilson Wu Date: Fri, 28 Nov 2025 10:53:55 +0800 Subject: [PATCH 051/197] [Docs] Update supported models for Olmo 3 in tool calling documentation (#29411) Signed-off-by: Wilson Wu --- docs/features/tool_calling.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index dd79ba19b7247..22dda37279ac6 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -371,7 +371,8 @@ Olmo 3 models output tool calls in a format that is very similar to the one expe Supported models: -* TODO (will be updated after Olmo 3 release) +* `allenai/Olmo-3-7B-Instruct` +* `allenai/Olmo-3-32B-Think` Flags: `--tool-call-parser olmo3` From c7ba1f6bc762af8f231e6ee885725e7401d74578 Mon Sep 17 00:00:00 2001 From: maang-h <55082429+maang-h@users.noreply.github.com> Date: Fri, 28 Nov 2025 13:42:30 +0800 Subject: [PATCH 052/197] [BugFix] Fix ValueError in NewRequestData repr methods (#29392) Signed-off-by: maang --- tests/v1/core/test_output.py | 36 ++++++++++++++++++++++++++++++++++++ vllm/v1/core/sched/output.py | 8 ++++++-- 2 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 tests/v1/core/test_output.py diff --git a/tests/v1/core/test_output.py b/tests/v1/core/test_output.py new file mode 100644 index 0000000000000..9dea19320e613 --- /dev/null +++ b/tests/v1/core/test_output.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch + +from vllm.v1.core.sched.output import NewRequestData + + +def _create_new_requests_data(prompt_embeds: torch.Tensor | None) -> NewRequestData: + return NewRequestData( + req_id="test_req", + prompt_token_ids=None, + mm_features=[], + sampling_params=None, + 
pooling_params=None, + block_ids=([],), + num_computed_tokens=0, + lora_request=None, + prompt_embeds=prompt_embeds, + ) + + +def test_repr_with_none() -> None: + """Test repr when prompt_embeds is None.""" + new_requests_data = _create_new_requests_data(None) + + assert "prompt_embeds_shape=None" in repr(new_requests_data) + assert "prompt_embeds_shape=None" in new_requests_data.anon_repr() + + +def test_repr_with_multi_element_tensor() -> None: + """Test repr when prompt_embeds is a multi-element tensor.""" + prompt_embeds = torch.randn(10, 768) + new_requests_data = _create_new_requests_data(prompt_embeds) + + assert "prompt_embeds_shape=torch.Size([10, 768])" in repr(new_requests_data) + assert "prompt_embeds_shape=torch.Size([10, 768])" in new_requests_data.anon_repr() diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index abfab43499b2a..b69fa87ebddc8 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -68,7 +68,9 @@ class NewRequestData: ) def __repr__(self) -> str: - prompt_embeds_shape = self.prompt_embeds.shape if self.prompt_embeds else None + prompt_embeds_shape = ( + self.prompt_embeds.shape if self.prompt_embeds is not None else None + ) return ( f"NewRequestData(" f"req_id={self.req_id}," @@ -88,7 +90,9 @@ class NewRequestData: prompt_token_ids_len = ( len(self.prompt_token_ids) if self.prompt_token_ids is not None else None ) - prompt_embeds_shape = self.prompt_embeds.shape if self.prompt_embeds else None + prompt_embeds_shape = ( + self.prompt_embeds.shape if self.prompt_embeds is not None else None + ) return ( f"NewRequestData(" f"req_id={self.req_id}," From 37b15e97e8443a7fd76f5aa95a78d5593f7241a4 Mon Sep 17 00:00:00 2001 From: EanWang211123 Date: Fri, 28 Nov 2025 14:05:45 +0800 Subject: [PATCH 053/197] [Multimodal][Speculative Decoding]Eagle3 mm support, enablement on qwen3vl (#29594) Signed-off-by: Tsai, Louie Signed-off-by: EanWang211123 Co-authored-by: Louie Tsai Co-authored-by: Cyrus Leung 
--- tests/models/registry.py | 4 ++++ tests/v1/e2e/test_spec_decode.py | 14 ++++++++++++++ vllm/model_executor/models/qwen3_vl.py | 23 ++++++++++++++++++++++- vllm/model_executor/models/registry.py | 1 + vllm/v1/spec_decode/eagle.py | 8 ++++---- 5 files changed, 45 insertions(+), 5 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index c9d4823d52792..1f4a106c06b4b 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -913,6 +913,10 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { "Qwen/Qwen2.5-VL-7B-Instruct", speculative_model="Rayzl/qwen2.5-vl-7b-eagle3-sgl", ), + "Eagle3Qwen3vlForCausalLM": _HfExamplesInfo( + "Qwen/Qwen3-VL-8B-Instruct", + speculative_model="taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3", + ), "Qwen3NextMTP": _HfExamplesInfo( "Qwen/Qwen3-Next-80B-A3B-Instruct", min_transformers_version="4.56.3" ), diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 03396270a31cb..3a25f7411eecd 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -283,6 +283,19 @@ def test_speculators_model_integration( ["model_setup", "mm_enabled", "enable_chunked_prefill"], [ (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False, False), + pytest.param( + ( + "eagle3", + "Qwen/Qwen3-VL-8B-Instruct", + "taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3", + 1, + ), + False, + False, + marks=pytest.mark.skip( + reason="architecture of its eagle3 is LlamaForCausalLMEagle3" + ), + ), pytest.param( ( "eagle3", @@ -352,6 +365,7 @@ def test_speculators_model_integration( ], ids=[ "qwen3_eagle3", + "qwen3_vl_eagle3", "qwen2_5_vl_eagle3", "llama3_eagle", "llama3_eagle3", diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 4cd6fa14c32df..52d31e70a8f05 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -89,6 +89,7 @@ from vllm.utils.collection_utils import is_list_of from .interfaces 
import ( MultiModalEmbeddings, + SupportsEagle3, SupportsLoRA, SupportsMRoPE, SupportsMultiModal, @@ -1122,9 +1123,14 @@ class Qwen3LLMModel(Qwen3Model): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] + + aux_hidden_states = [] for layer_idx, layer in islice( enumerate(self.layers), self.start_layer, self.end_layer ): + if layer_idx in self.aux_hidden_state_layers: + aux_hidden_states.append(hidden_states + residual) + hidden_states, residual = layer( positions, hidden_states, @@ -1144,6 +1150,9 @@ class Qwen3LLMModel(Qwen3Model): {"hidden_states": hidden_states, "residual": residual} ) hidden_states, _ = self.norm(hidden_states, residual) + + if len(aux_hidden_states) > 0: + return hidden_states, aux_hidden_states return hidden_states @@ -1186,7 +1195,12 @@ class Qwen3LLMForCausalLM(Qwen3ForCausalLM): dummy_inputs=Qwen3VLDummyInputsBuilder, ) class Qwen3VLForConditionalGeneration( - nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE + nn.Module, + SupportsMultiModal, + SupportsLoRA, + SupportsPP, + SupportsMRoPE, + SupportsEagle3, ): merge_by_field_config = True multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"} @@ -1279,6 +1293,13 @@ class Qwen3VLForConditionalGeneration( self.visual_dim = config.vision_config.out_hidden_size self.multiscale_dim = self.visual_dim * self.deepstack_num_level + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + self.language_model.model.aux_hidden_state_layers = layers + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + num_layers = len(self.language_model.model.layers) + return (2, num_layers // 2, num_layers - 3) + def _get_deepstack_input_embeds(self, num_tokens: int) -> IntermediateTensors: # get deepstack_input_embeds from buffer, and clear the buffer return IntermediateTensors( diff --git a/vllm/model_executor/models/registry.py 
b/vllm/model_executor/models/registry.py index ba9f33819c950..0d582043e8c02 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -414,6 +414,7 @@ _SPECULATIVE_DECODING_MODELS = { "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "Eagle3Qwen2_5vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), + "Eagle3Qwen3vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"), diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 7600df48150ac..305abdade8da6 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1017,10 +1017,10 @@ class EagleProposer: if supports_multimodal(target_model): # handle multimodality - if ( - self.get_model_name(target_model) - == "Qwen2_5_VLForConditionalGeneration" - ): + if self.get_model_name(target_model) in [ + "Qwen2_5_VLForConditionalGeneration", + "Qwen3VLForConditionalGeneration", + ]: self.model.config.image_token_index = target_model.config.image_token_id else: self.model.config.image_token_index = ( From f4b76056ee5c3a3f917527da5be3786e1b8530c6 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 28 Nov 2025 14:05:48 +0800 Subject: [PATCH 054/197] Improve enable chunked_prefill & prefix_caching logic. 
(#26623) Signed-off-by: wang.yuqi Signed-off-by: wang.yuqi Co-authored-by: Cyrus Leung --- .../pooling/test_auto_prefix_cache_support.py | 4 +- tests/test_config.py | 240 +++++++++++++++++- vllm/config/model.py | 109 ++++++++ vllm/config/pooler.py | 6 +- vllm/config/vllm.py | 76 ++---- vllm/engine/arg_utils.py | 90 +++---- vllm/model_executor/models/bert.py | 4 +- vllm/model_executor/models/interfaces_base.py | 35 ++- vllm/model_executor/models/modernbert.py | 3 +- vllm/model_executor/models/registry.py | 15 +- vllm/v1/engine/core.py | 7 +- 11 files changed, 456 insertions(+), 133 deletions(-) diff --git a/tests/models/language/pooling/test_auto_prefix_cache_support.py b/tests/models/language/pooling/test_auto_prefix_cache_support.py index 0904c7e877ef4..3795f2a5d8664 100644 --- a/tests/models/language/pooling/test_auto_prefix_cache_support.py +++ b/tests/models/language/pooling/test_auto_prefix_cache_support.py @@ -105,8 +105,6 @@ def test_embed_models( def test_non_causal_models( hf_runner, vllm_runner, example_prompts, model: str, dtype: str ) -> None: - with vllm_runner( - model, max_model_len=512, dtype=dtype, enable_prefix_caching=True - ) as vllm_model: + with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model: cache_config = vllm_model.llm.llm_engine.cache_config assert not cache_config.enable_prefix_caching diff --git a/tests/test_config.py b/tests/test_config.py index 080e4d2afacc6..112b02edd0389 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +import logging import os from dataclasses import MISSING, Field, asdict, dataclass, field from unittest.mock import patch @@ -602,6 +602,244 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files): assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer) +@pytest.mark.parametrize( + ("model_id", "expected_attn_type", 
"expected_result", "reason"), + [ + # pooling models + ( + "jason9693/Qwen2.5-1.5B-apeach", + "decoder", + True, + "Pooling models with causal attn and last pooling support chunked prefill.", + ), + ( + "Qwen/Qwen3-Embedding-0.6B", + "decoder", + True, + "Pooling models with causal attn and last pooling support chunked prefill.", + ), + ( + "Qwen/Qwen2.5-Math-PRM-7B", + "decoder", + False, + "Pooling models with step pooling does not support chunked prefill.", + ), + ( + "internlm/internlm2-1_8b-reward", + "decoder", + False, + "Pooling models with all pooling does not support chunked prefill.", + ), + ( + "BAAI/bge-base-en", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + ( + "boltuix/NeuroBERT-NER", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + ( + "papluca/xlm-roberta-base-language-detection", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + ( + "Alibaba-NLP/gte-Qwen2-1.5B-instruct", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + ( + "intfloat/e5-small", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + # multimodal models + ( + "openai/clip-vit-base-patch32", + "decoder", + True, + "Pooling models with causal attn and last pooling support chunked prefill.", + ), + ( + "google/siglip-base-patch16-224", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + # generate models + ( + "Qwen/Qwen3-0.6B", + "decoder", + True, + "Generative models support chunked prefill.", + ), + ( + "Qwen/Qwen3-Next-80B-A3B-Instruct", + "hybrid", + True, + "Generative models support chunked prefill.", + ), + ( + "ibm-granite/granite-4.0-h-small", + "hybrid", + True, + "Generative models support chunked prefill.", + 
), + ( + "state-spaces/mamba-130m-hf", + "attention_free", + True, + "Generative models support chunked prefill.", + ), + # encoder_decoder models + ( + "openai/whisper-small", + "encoder_decoder", + False, + "Encoder decoder models does not support chunked prefill.", + ), + ], +) +def test_is_chunked_prefill_supported( + model_id: str, + expected_attn_type: str, + expected_result: bool, + reason: str, + caplog_vllm, +): + model_config = ModelConfig(model_id, trust_remote_code=True) + assert model_config.attn_type == expected_attn_type + with caplog_vllm.at_level(level=logging.DEBUG): + assert model_config.is_chunked_prefill_supported == expected_result + assert reason in caplog_vllm.text + + +@pytest.mark.parametrize( + ("model_id", "expected_attn_type", "expected_result", "reason"), + [ + # pooling models + ( + "jason9693/Qwen2.5-1.5B-apeach", + "decoder", + True, + "Pooling models with causal attn and last pooling support prefix caching.", + ), + ( + "Qwen/Qwen3-Embedding-0.6B", + "decoder", + True, + "Pooling models with causal attn and last pooling support prefix caching.", + ), + ( + "Qwen/Qwen2.5-Math-PRM-7B", + "decoder", + False, + "Pooling models with step pooling does not support prefix caching.", + ), + ( + "internlm/internlm2-1_8b-reward", + "decoder", + False, + "Pooling models with all pooling does not support prefix caching.", + ), + ( + "BAAI/bge-base-en", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + ( + "boltuix/NeuroBERT-NER", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + ( + "papluca/xlm-roberta-base-language-detection", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + ( + "Alibaba-NLP/gte-Qwen2-1.5B-instruct", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + ( + "intfloat/e5-small", + 
"encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + # multimodal models + ( + "openai/clip-vit-base-patch32", + "decoder", + True, + "Pooling models with causal attn and last pooling support prefix caching.", + ), + ( + "google/siglip-base-patch16-224", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + # generate models + ( + "Qwen/Qwen3-0.6B", + "decoder", + True, + "Generative models support prefix caching.", + ), + ( + "Qwen/Qwen3-Next-80B-A3B-Instruct", + "hybrid", + False, + "Hybrid models does not support prefix caching since the feature is still experimental.", # noqa: E501 + ), + ( + "ibm-granite/granite-4.0-h-small", + "hybrid", + False, + "Hybrid models does not support prefix caching since the feature is still experimental.", # noqa: E501 + ), + ( + "state-spaces/mamba-130m-hf", + "attention_free", + False, + "Attention free models does not support prefix caching since the feature is still experimental.", # noqa: E501 + ), + # encoder_decoder models + ( + "openai/whisper-small", + "encoder_decoder", + False, + "Encoder decoder models does not support prefix caching.", + ), + ], +) +def test_is_prefix_caching_supported( + model_id: str, + expected_attn_type: str, + expected_result: bool, + reason: str, + caplog_vllm, +): + model_config = ModelConfig(model_id, trust_remote_code=True) + assert model_config.attn_type == expected_attn_type + with caplog_vllm.at_level(level=logging.DEBUG): + assert model_config.is_prefix_caching_supported == expected_result + assert reason in caplog_vllm.text + + @pytest.mark.parametrize( ("backend", "custom_ops", "expected"), [ diff --git a/vllm/config/model.py b/vllm/config/model.py index 21d602b30ac1a..b9ae4fec14efa 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -107,6 +107,10 @@ _RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = { "draft": [], } +AttnTypeStr = Literal[ + "decoder", 
"encoder", "encoder_only", "encoder_decoder", "attention_free", "hybrid" +] + @config @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) @@ -1752,6 +1756,111 @@ class ModelConfig: logger.info("Using max model len %s", max_model_len) return max_model_len + @property + def attn_type(self) -> AttnTypeStr: + if self.pooler_config is not None: + pooling_type = self._model_info.default_pooling_type.lower() + if pooling_type == "cls": + return "encoder_only" + else: + is_causal = getattr(self.hf_config, "is_causal", True) + return "encoder_only" if not is_causal else self._model_info.attn_type + elif self.is_hybrid: + return "hybrid" + elif self.is_attention_free: + return "attention_free" + elif self.is_encoder_decoder: + return "encoder_decoder" + else: + return "decoder" + + @property + def is_chunked_prefill_supported(self) -> bool: + attn_type = self.attn_type + if self.pooler_config is not None: + # for pooling models + if attn_type == "encoder_only": + logger.debug( + "Pooling models with bidirectional attn does not support " + "chunked prefill." + ) + return False + elif attn_type == "decoder": + pooling_type = self.pooler_config.pooling_type.lower() + if pooling_type in ["all", "mean", "step", "cls"]: + logger.debug( + "Pooling models with %s pooling does not " + "support chunked prefill.", + pooling_type, + ) + return False + else: + # pooling_type == "last" + logger.debug( + "Pooling models with causal attn and last pooling support " + "chunked prefill." + ) + return True + # vllm currently does not have pooling models using hybrid, + # attention_free or encoder_decoder attn types. 
+ return attn_type != "encoder_decoder" + else: + if attn_type == "encoder_decoder": + logger.debug("Encoder decoder models does not support chunked prefill.") + return False + logger.debug("Generative models support chunked prefill.") + return True + + @property + def is_prefix_caching_supported(self) -> bool: + attn_type = self.attn_type + if self.pooler_config is not None: + # for pooling models + if attn_type == "encoder_only": + logger.debug( + "Pooling models with bidirectional attn does not " + "support prefix caching." + ) + return False + elif attn_type == "decoder": + pooling_type = self.pooler_config.pooling_type.lower() + if pooling_type in ["all", "mean", "step", "cls"]: + logger.debug( + "Pooling models with %s pooling does not " + "support prefix caching.", + pooling_type, + ) + return False + else: + # pooling_type == "last" + logger.debug( + "Pooling models with causal attn and last pooling support " + "prefix caching." + ) + return True + # vllm currently does not have pooling models using hybrid, + # attention_free or encoder_decoder attn types. + return False + else: + if attn_type == "hybrid": + logger.debug( + "Hybrid models does not support prefix caching since the feature " + "is still experimental." + ) + return False + elif attn_type == "attention_free": + logger.debug( + "Attention free models does not support prefix caching since the " + "feature is still experimental." 
+ ) + return False + elif attn_type == "encoder_decoder": + logger.debug("Encoder decoder models does not support prefix caching.") + return False + else: # attn_type == "decoder" + logger.debug("Generative models support prefix caching.") + return True + def is_model_moe( self, ) -> bool: diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py index 85950bbcd666f..aa4e7006d0247 100644 --- a/vllm/config/pooler.py +++ b/vllm/config/pooler.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any +from typing import Any, Literal from pydantic.dataclasses import dataclass @@ -11,13 +11,15 @@ from vllm.utils.hashing import safe_hash logger = init_logger(__name__) +PoolingTypeStr = Literal["LAST", "ALL", "CLS", "STEP", "MEAN"] + @config @dataclass class PoolerConfig: """Controls the behavior of output pooling in pooling models.""" - pooling_type: str | None = None + pooling_type: PoolingTypeStr | None = None """ The pooling method of the pooling model. This should be a key in [`vllm.model_executor.layers.pooler.PoolingType`][]. diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index c576275e80fe3..7ac8cc764322e 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -721,65 +721,27 @@ class VllmConfig: "correctness and to realize prefill savings. " ) - disable_chunked_prefill_reasons: list[str] = [] + if self.model_config and self.model_config.is_encoder_decoder: + from vllm.multimodal import MULTIMODAL_REGISTRY - if self.model_config: - if self.model_config.pooler_config: - pooling_type = self.model_config.pooler_config.pooling_type - if pooling_type is None or pooling_type.lower() != "last": - disable_chunked_prefill_reasons.append( - 'Only "last" pooling supports chunked ' - "prefill and prefix caching; disabling both." 
- ) - if not getattr(self.model_config.hf_config, "is_causal", True): - disable_chunked_prefill_reasons.append( - "Only models using causal attention support chunked " - "prefill and prefix caching; disabling both." - ) - elif self.model_config.is_encoder_decoder: - from vllm.multimodal import MULTIMODAL_REGISTRY - - self.scheduler_config.max_num_encoder_input_tokens = ( - MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) - ) - logger.debug( - "Encoder-decoder model detected: setting " - "`max_num_encoder_input_tokens` to encoder length (%s)", - self.scheduler_config.max_num_encoder_input_tokens, - ) - if ( - self.model_config.architecture == "WhisperForConditionalGeneration" - and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn" - ): - logger.warning( - "Whisper is known to have issues with " - "forked workers. If startup is hanging, " - "try setting 'VLLM_WORKER_MULTIPROC_METHOD' " - "to 'spawn'." - ) - - # Final off-switch for CP/APC: - # Disable for (a) collected blockers, (b) encoder–decoder, or - # (c) explicit CP=False when APC wasn't requested. - # Do NOT disable merely because the resolved CP flag is False. 
- apc_requested = ( - self.cache_config is not None and self.cache_config.enable_prefix_caching - ) - if ( - disable_chunked_prefill_reasons - or (self.model_config is not None and self.model_config.is_encoder_decoder) - or ( - self.scheduler_config.enable_chunked_prefill is False - and not apc_requested + self.scheduler_config.max_num_encoder_input_tokens = ( + MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) ) - ): - for reason in disable_chunked_prefill_reasons: - logger.info(reason) - self.scheduler_config.enable_chunked_prefill = False - self.scheduler_config.long_prefill_token_threshold = 0 - - if self.cache_config is not None: - self.cache_config.enable_prefix_caching = False + logger.debug( + "Encoder-decoder model detected: setting " + "`max_num_encoder_input_tokens` to encoder length (%s)", + self.scheduler_config.max_num_encoder_input_tokens, + ) + if ( + self.model_config.architecture == "WhisperForConditionalGeneration" + and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn" + ): + logger.warning( + "Whisper is known to have issues with " + "forked workers. If startup is hanging, " + "try setting 'VLLM_WORKER_MULTIPROC_METHOD' " + "to 'spawn'." + ) if ( self.kv_events_config is not None diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e4c9a82d25223..ad5a34c56161c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1349,30 +1349,10 @@ class EngineArgs: self.tokenizer = model_config.tokenizer self._check_feature_supported(model_config) - - # Set default arguments for V1 Engine. 
- self._set_default_args(usage_context, model_config) - # Disable chunked prefill and prefix caching for: - # POWER (ppc64le)/s390x/RISCV CPUs in V1 - if current_platform.is_cpu() and current_platform.get_cpu_architecture() in ( - CpuArchEnum.POWERPC, - CpuArchEnum.S390X, - CpuArchEnum.RISCV, - ): - logger.info( - "Chunked prefill is not supported for ARM and POWER, " - "S390X and RISC-V CPUs; " - "disabling it for V1 backend." - ) - self.enable_chunked_prefill = False - logger.info( - "Prefix caching is not supported for ARM and POWER, " - "S390X and RISC-V CPUs; " - "disabling it for V1 backend." - ) - self.enable_prefix_caching = False - - assert self.enable_chunked_prefill is not None + self._set_default_chunked_prefill_and_prefix_caching_args(model_config) + self._set_default_max_num_seqs_and_batched_tokens_args( + usage_context, model_config + ) sliding_window: int | None = None if not is_interleaved(model_config.hf_text_config): @@ -1805,34 +1785,6 @@ class EngineArgs: ) _raise_unsupported_error(feature_name=name) - @classmethod - def get_chunked_prefill_prefix_caching_defaults( - cls, - model_config: ModelConfig, - ) -> tuple[bool, bool]: - if model_config.runner_type != "pooling": - default_chunked_prefill = True - - # Disable prefix caching default for hybrid models and mamba-only - # models since the feature is still experimental. 
- default_prefix_caching = not ( - model_config.is_hybrid or model_config.is_attention_free - ) - else: - assert model_config.pooler_config is not None - - pooling_type = model_config.pooler_config.pooling_type - incremental_prefill_supported = ( - pooling_type is not None - and pooling_type.lower() == "last" - and getattr(model_config.hf_config, "is_causal", True) - ) - - default_chunked_prefill = incremental_prefill_supported - default_prefix_caching = incremental_prefill_supported - - return default_chunked_prefill, default_prefix_caching - @classmethod def get_batch_defaults( cls, @@ -1916,14 +1868,11 @@ class EngineArgs: return default_max_num_batched_tokens, default_max_num_seqs - def _set_default_args( - self, usage_context: UsageContext, model_config: ModelConfig + def _set_default_chunked_prefill_and_prefix_caching_args( + self, model_config: ModelConfig ) -> None: - """Set Default Arguments for V1 Engine.""" - ( - default_chunked_prefill, - default_prefix_caching, - ) = self.get_chunked_prefill_prefix_caching_defaults(model_config) + default_chunked_prefill = model_config.is_chunked_prefill_supported + default_prefix_caching = model_config.is_prefix_caching_supported if self.prefill_context_parallel_size > 1: default_chunked_prefill = False @@ -1984,6 +1933,29 @@ class EngineArgs: scope="local", ) + # Disable chunked prefill and prefix caching for: + # POWER (ppc64le)/s390x/RISCV CPUs in V1 + if current_platform.is_cpu() and current_platform.get_cpu_architecture() in ( + CpuArchEnum.POWERPC, + CpuArchEnum.S390X, + CpuArchEnum.RISCV, + ): + logger.info( + "Chunked prefill is not supported for ARM and POWER, " + "S390X and RISC-V CPUs; " + "disabling it for V1 backend." + ) + self.enable_chunked_prefill = False + logger.info( + "Prefix caching is not supported for ARM and POWER, " + "S390X and RISC-V CPUs; " + "disabling it for V1 backend." 
+ ) + self.enable_prefix_caching = False + + def _set_default_max_num_seqs_and_batched_tokens_args( + self, usage_context: UsageContext, model_config: ModelConfig + ): world_size = self.pipeline_parallel_size * self.tensor_parallel_size ( default_max_num_batched_tokens, diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 2679448bce775..e774cd647ea8c 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -32,7 +32,7 @@ from vllm.tasks import PoolingTask from vllm.v1.pool.metadata import PoolingMetadata from .interfaces import SupportsCrossEncoding, SupportsQuant -from .interfaces_base import default_pooling_type +from .interfaces_base import attn_type, default_pooling_type from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -432,7 +432,6 @@ class BertModel(nn.Module, SupportsQuant): return loaded_params -@default_pooling_type("ALL") class BertPoolingModel(BertModel): is_pooling_model = True @@ -864,6 +863,7 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, SupportsQu ) +@attn_type("encoder_only") @default_pooling_type("ALL") class BertForTokenClassification(nn.Module): is_pooling_model = True diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 85c5574bacf0a..2c99fce8d918c 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -19,10 +19,14 @@ from vllm.utils.func_utils import supports_kw if TYPE_CHECKING: from vllm.config import VllmConfig + from vllm.config.model import AttnTypeStr + from vllm.config.pooler import PoolingTypeStr from vllm.model_executor.layers.pooler import Pooler else: VllmConfig = Any Pooler = Any + PoolingTypeStr = Any + AttnTypeStr = Any logger = init_logger(__name__) @@ -165,7 +169,7 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): MRO of your model class. 
""" - default_pooling_type: ClassVar[str] = "LAST" + default_pooling_type: ClassVar[PoolingTypeStr] = "LAST" """ Indicates the [vllm.config.pooler.PoolerConfig.pooling_type][] to use by default. @@ -175,6 +179,17 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): decorator to conveniently set this field. """ + attn_type: ClassVar[AttnTypeStr] = "decoder" + """ + Indicates the + [vllm.config.model.ModelConfig.attn_type][] + to use by default. + + You can use the + [vllm.model_executor.models.interfaces_base.attn_type][] + decorator to conveniently set this field. + """ + pooler: Pooler """The pooler is only called on TP rank 0.""" @@ -199,7 +214,7 @@ def is_pooling_model( _T = TypeVar("_T", bound=type[nn.Module]) -def default_pooling_type(pooling_type: str): +def default_pooling_type(pooling_type: PoolingTypeStr): """Decorator to set `VllmModelForPooling.default_pooling_type`.""" def func(model: _T) -> _T: @@ -209,5 +224,19 @@ def default_pooling_type(pooling_type: str): return func -def get_default_pooling_type(model: type[object] | object) -> str: +def get_default_pooling_type(model: type[object] | object) -> PoolingTypeStr: return getattr(model, "default_pooling_type", "LAST") + + +def attn_type(attn_type: AttnTypeStr): + """Decorator to set `VllmModelForPooling.attn_type`.""" + + def func(model: _T) -> _T: + model.attn_type = attn_type # type: ignore + return model + + return func + + +def get_attn_type(model: type[object] | object) -> AttnTypeStr: + return getattr(model, "attn_type", "decoder") diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 3a8a6c74d9d15..743bc23d9876f 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -28,7 +28,7 @@ from vllm.tasks import PoolingTask from vllm.v1.pool.metadata import PoolingMetadata from .interfaces import SupportsCrossEncoding -from .interfaces_base import default_pooling_type +from .interfaces_base import 
attn_type, default_pooling_type from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -396,6 +396,7 @@ class ModernBertPredictionHead(nn.Module): return self.norm(self.act(self.dense(hidden_states))) +@attn_type("encoder_only") @default_pooling_type("ALL") class ModernBertForTokenClassification(nn.Module): is_pooling_model = True diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 0d582043e8c02..73a61f1148b50 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -17,7 +17,7 @@ from collections.abc import Callable, Set from dataclasses import asdict, dataclass, field from functools import lru_cache from pathlib import Path -from typing import TypeVar +from typing import TYPE_CHECKING, Any, TypeVar import torch.nn as nn import transformers @@ -33,6 +33,14 @@ from vllm.logging_utils import logtime from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module from vllm.utils.hashing import safe_hash +if TYPE_CHECKING: + from vllm.config.model import AttnTypeStr + from vllm.config.pooler import PoolingTypeStr +else: + AttnTypeStr = Any + PoolingTypeStr = Any + + from .interfaces import ( has_inner_state, has_noops, @@ -47,6 +55,7 @@ from .interfaces import ( supports_transcription, ) from .interfaces_base import ( + get_attn_type, get_default_pooling_type, is_pooling_model, is_text_generation_model, @@ -509,7 +518,8 @@ class _ModelInfo: architecture: str is_text_generation_model: bool is_pooling_model: bool - default_pooling_type: str + attn_type: AttnTypeStr + default_pooling_type: PoolingTypeStr supports_cross_encoding: bool supports_multimodal: bool supports_multimodal_raw_input_only: bool @@ -530,6 +540,7 @@ class _ModelInfo: is_text_generation_model=is_text_generation_model(model), is_pooling_model=is_pooling_model(model), default_pooling_type=get_default_pooling_type(model), + attn_type=get_attn_type(model), 
supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), supports_multimodal_raw_input_only=supports_multimodal_raw_input_only( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 8657a95b5e6e7..e3a5f51a8fc56 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -119,11 +119,12 @@ class EngineCore: # Setup scheduler. Scheduler = vllm_config.scheduler_config.get_scheduler_cls() - if len(kv_cache_config.kv_cache_groups) == 0: + if len(kv_cache_config.kv_cache_groups) == 0: # noqa: SIM102 # Encoder models without KV cache don't support # chunked prefill. But do SSM models? - logger.info("Disabling chunked prefill for model without KVCache") - vllm_config.scheduler_config.enable_chunked_prefill = False + if vllm_config.scheduler_config.enable_chunked_prefill: + logger.warning("Disabling chunked prefill for model without KVCache") + vllm_config.scheduler_config.enable_chunked_prefill = False scheduler_block_size = ( vllm_config.cache_config.block_size From b34e8775a31c1a077a1a24f22ffbf048b2a979f6 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Nov 2025 14:43:18 +0800 Subject: [PATCH 055/197] Revert "[CPU]Update CPU PyTorch to 2.9.0 (#29589)" (#29647) Signed-off-by: DarkLight1337 --- docker/Dockerfile.cpu | 4 ++++ requirements/cpu-build.txt | 4 ++-- requirements/cpu.txt | 8 ++++---- vllm/model_executor/models/qwen3_vl.py | 4 ++-- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 67d3fb83a0275..eb3807ef0ca4e 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -119,6 +119,7 @@ FROM base AS vllm-test-deps WORKDIR /workspace/vllm +# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ cp requirements/test.in requirements/cpu-test.in && \ sed -i '/mamba_ssm/d' requirements/cpu-test.in && \ @@ 
-131,6 +132,9 @@ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ esac; \ }; \ remove_packages_not_supported_on_aarch64 && \ + sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \ + sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \ + sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \ uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu RUN --mount=type=cache,target=/root/.cache/uv \ diff --git a/requirements/cpu-build.txt b/requirements/cpu-build.txt index 0c6fdd3b33cd1..81d429a5e5f8d 100644 --- a/requirements/cpu-build.txt +++ b/requirements/cpu-build.txt @@ -4,9 +4,9 @@ packaging>=24.2 setuptools>=77.0.3,<81.0.0 setuptools-scm>=8 --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.9.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" +torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" torch==2.9.0; platform_system == "Darwin" -torch==2.9.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL) wheel jinja2>=3.1.6 diff --git a/requirements/cpu.txt b/requirements/cpu.txt index 8c04d6d5ce1b0..e23d3286f3f78 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -7,17 +7,17 @@ numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative d packaging>=24.2 setuptools>=77.0.3,<81.0.0 --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.9.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" +torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" torch==2.9.0; platform_system == "Darwin" -torch==2.9.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +torch==2.8.0; platform_machine == "ppc64le" or platform_machine == 
"aarch64" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" -torchaudio==2.9.0; platform_machine == "ppc64le" +torchaudio==2.8.0; platform_machine == "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" -torchvision==0.24.0; platform_machine == "ppc64le" +torchvision==0.23.0; platform_machine == "ppc64le" datasets # for benchmark scripts # Intel Extension for PyTorch, only for x86_64 CPUs diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 52d31e70a8f05..39fe8336b84a1 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1123,14 +1123,14 @@ class Qwen3LLMModel(Qwen3Model): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - + aux_hidden_states = [] for layer_idx, layer in islice( enumerate(self.layers), self.start_layer, self.end_layer ): if layer_idx in self.aux_hidden_state_layers: aux_hidden_states.append(hidden_states + residual) - + hidden_states, residual = layer( positions, hidden_states, From 480598958e28fa1e2ed2f7be2d457fc6f85a1748 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Fri, 28 Nov 2025 15:53:20 +0800 Subject: [PATCH 056/197] [Feature][Bench] Add pareto visualization (#29477) Signed-off-by: rongfu.leng --- docs/contributing/benchmarks.md | 18 ++ docs/mkdocs/hooks/generate_argparse.py | 4 + vllm/benchmarks/sweep/cli.py | 3 + vllm/benchmarks/sweep/plot_pareto.py | 393 +++++++++++++++++++++++++ 4 files changed, 418 insertions(+) create mode 100644 vllm/benchmarks/sweep/plot_pareto.py diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index c9bc9cfe28a35..e4714e6266381 100644 --- a/docs/contributing/benchmarks.md +++ 
b/docs/contributing/benchmarks.md @@ -1146,6 +1146,24 @@ vllm bench sweep plot benchmarks/results/ \ !!! tip You can use `--dry-run` to preview the figures to be plotted. +### Pareto visualization (tokens/s/user vs tokens/s/GPU) + +`vllm bench sweep plot_pareto` helps pick configurations that balance per-user and per-GPU throughput. + +Higher concurrency or batch size can raise GPU efficiency (per-GPU), but can add per user latency; lower concurrency improves per-user rate but underutilizes GPUs; The Pareto frontier shows the best achievable pairs across your runs. + +- x-axis: tokens/s/user = `output_throughput` ÷ concurrency (`--user-count-var`, default `max_concurrency`, fallback `max_concurrent_requests`). +- y-axis: tokens/s/GPU = `output_throughput` ÷ GPU count (`--gpu-count-var` if set; else gpu_count is TP×PP*DP). +- Output: a single figure at `OUTPUT_DIR/pareto/PARETO.png`. +- Show the configuration used in each data point `--label-by` (default: `max_concurrency,gpu_count`). + +Example: + +```bash +vllm bench sweep plot_pareto benchmarks/results/ \ + --label-by max_concurrency,tensor_parallel_size,pipeline_parallel_size +``` + ## Performance Benchmarks The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM. 
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index 735074c08b8c8..4ae64a6e4bfcc 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -94,6 +94,9 @@ def auto_mock(module_name: str, attr: str, max_mocks: int = 100): bench_latency = auto_mock("vllm.benchmarks", "latency") bench_serve = auto_mock("vllm.benchmarks", "serve") bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs") +bench_sweep_plot_pareto = auto_mock( + "vllm.benchmarks.sweep.plot_pareto", "SweepPlotParetoArgs" +) bench_sweep_serve = auto_mock("vllm.benchmarks.sweep.serve", "SweepServeArgs") bench_sweep_serve_sla = auto_mock( "vllm.benchmarks.sweep.serve_sla", "SweepServeSLAArgs" @@ -221,6 +224,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): "bench_latency": create_parser(bench_latency.add_cli_args), "bench_serve": create_parser(bench_serve.add_cli_args), "bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args), + "bench_sweep_plot_pareto": create_parser(bench_sweep_plot_pareto.add_cli_args), "bench_sweep_serve": create_parser(bench_sweep_serve.add_cli_args), "bench_sweep_serve_sla": create_parser(bench_sweep_serve_sla.add_cli_args), "bench_throughput": create_parser(bench_throughput.add_cli_args), diff --git a/vllm/benchmarks/sweep/cli.py b/vllm/benchmarks/sweep/cli.py index 108cd75690864..e74e0e2c181c5 100644 --- a/vllm/benchmarks/sweep/cli.py +++ b/vllm/benchmarks/sweep/cli.py @@ -6,6 +6,8 @@ from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG from .plot import SweepPlotArgs from .plot import main as plot_main +from .plot_pareto import SweepPlotParetoArgs +from .plot_pareto import main as plot_pareto_main from .serve import SweepServeArgs from .serve import main as serve_main from .serve_sla import SweepServeSLAArgs @@ -15,6 +17,7 @@ SUBCOMMANDS = ( (SweepServeArgs, serve_main), (SweepServeSLAArgs, serve_sla_main), (SweepPlotArgs, 
plot_main), + (SweepPlotParetoArgs, plot_pareto_main), ) diff --git a/vllm/benchmarks/sweep/plot_pareto.py b/vllm/benchmarks/sweep/plot_pareto.py new file mode 100644 index 0000000000000..70472552b5cd4 --- /dev/null +++ b/vllm/benchmarks/sweep/plot_pareto.py @@ -0,0 +1,393 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import math +from concurrent.futures import ProcessPoolExecutor +from dataclasses import dataclass +from functools import partial +from pathlib import Path +from typing import ClassVar + +from vllm.utils.collection_utils import full_groupby +from vllm.utils.import_utils import PlaceholderModule + +from .plot import DummyExecutor, _json_load_bytes +from .utils import sanitize_filename + +try: + import matplotlib.pyplot as plt + import pandas as pd + import seaborn as sns +except ImportError: + plt = PlaceholderModule("matplotlib").placeholder_attr("pyplot") + pd = PlaceholderModule("pandas") + sns = PlaceholderModule("seaborn") + + +def _first_present(run_data: dict[str, object], keys: list[str]): + for key in keys: + for candidate in {key, key.replace("_", "-"), key.replace("-", "_")}: + if candidate in run_data: + return run_data[candidate] + return None + + +def _get_numeric( + run_data: dict[str, object], + keys: list[str], + *, + allow_zero: bool = True, +) -> float | None: + value = _first_present(run_data, keys) + if value is None: + return None + + try: + numeric = float(value) + except (TypeError, ValueError) as exc: + raise ValueError( + f"Expected numeric value for one of {keys}, " + f"but found {value!r} in {run_data=}" + ) from exc + + if not allow_zero and numeric == 0: + return None + + return numeric + + +def _infer_user_count( + run_data: dict[str, object], + user_count_var: str | None, +) -> float | None: + candidates = [user_count_var] if user_count_var else [] + candidates.extend(["request_rate"]) + user_count = _get_numeric(run_data, candidates, 
allow_zero=False) + if user_count is not None: + return user_count + + # Fallback to the observed peak if configured value is missing. + return _get_numeric(run_data, ["max_concurrent_requests"], allow_zero=False) + + +def _infer_gpu_count( + run_data: dict[str, object], + gpu_count_var: str | None, +) -> float: + direct_candidates = [gpu_count_var] if gpu_count_var else [] + direct_gpu_count = _get_numeric(run_data, direct_candidates, allow_zero=False) + if direct_gpu_count: + return direct_gpu_count + + tp_size = _get_numeric(run_data, ["tensor_parallel_size", "tp"]) + pp_size = _get_numeric(run_data, ["pipeline_parallel_size", "pp"]) + dp_size = _get_numeric(run_data, ["data_parallel_size", "dp"]) + world_size = 1.0 + if tp_size: + world_size *= tp_size + if pp_size: + world_size *= pp_size + if dp_size: + world_size *= dp_size + + return world_size + + +def _get_throughput( + run_data: dict[str, object], + throughput_var: str, +) -> float: + throughput = _get_numeric(run_data, [throughput_var]) + if throughput is None: + raise ValueError( + f"Cannot find throughput metric {throughput_var!r} in run data. 
" + f"Available keys: {sorted(run_data)}" + ) + + return throughput + + +def _prepare_records( + all_data: list[dict[str, object]], + *, + user_count_var: str | None, + gpu_count_var: str | None, +) -> tuple[list[dict[str, object]], int]: + prepared = [] + skipped_missing_users = 0 + + for record in all_data: + throughput = _get_throughput(record, "output_throughput") + user_count = _infer_user_count(record, user_count_var) + if user_count is None: + skipped_missing_users += 1 + continue + + gpu_count = _infer_gpu_count(record, gpu_count_var) + tokens_per_user = throughput / user_count + tokens_per_gpu = throughput / gpu_count + + prepared.append( + { + **record, + "tokens_per_user": tokens_per_user, + "tokens_per_gpu": tokens_per_gpu, + "user_count_estimate": user_count, + "gpu_count": gpu_count, + } + ) + + return prepared, skipped_missing_users + + +def _pareto_frontier( + df: "pd.DataFrame", + x_col: str, + y_col: str, + *, + epsilon: float = 1e-9, +) -> "pd.DataFrame": + sorted_df = df.sort_values([x_col, y_col], ascending=[False, False]) + frontier_indices = [] + best_y = -math.inf + + for idx, row in sorted_df.iterrows(): + y_val = row[y_col] + if y_val >= best_y - epsilon: + frontier_indices.append(idx) + best_y = max(best_y, y_val) + + return df.loc[frontier_indices] + + +def _get_fig_path( + fig_dir: Path, + fig_group: tuple[tuple[str, str], ...], +) -> Path: + parts = ["PARETO"] + if fig_group: + parts.extend(f"{k}={v}" for k, v in fig_group) + filename = sanitize_filename("-".join(parts) + ".png") + return fig_dir / filename + + +def _plot_fig( + fig_dir: Path, + fig_group_data: tuple[tuple[tuple[str, str], ...], list[dict[str, object]]], + label_by: list[str], + *, + dry_run: bool, +): + fig_group, fig_data = fig_group_data + fig_path = _get_fig_path(fig_dir, fig_group) + + print("[BEGIN FIGURE]") + print(f"Group: {dict(fig_group)}") + print(f"Output file: {fig_path}") + + if dry_run: + print("[END FIGURE]") + return + + df = 
pd.DataFrame.from_records(fig_data) + df = df.dropna(subset=["tokens_per_user", "tokens_per_gpu"]) + + if df.empty: + print("No data points available after filtering; skipping.") + print("[END FIGURE]") + return + + frontier = _pareto_frontier(df, "tokens_per_user", "tokens_per_gpu") + frontier = frontier.sort_values("tokens_per_user") + + fig, ax = plt.subplots() + sns.scatterplot( + data=df, + x="tokens_per_user", + y="tokens_per_gpu", + color="0.5", + alpha=0.6, + ax=ax, + label="All runs", + ) + sns.lineplot( + data=frontier, + x="tokens_per_user", + y="tokens_per_gpu", + marker="o", + ax=ax, + label="Pareto frontier", + ) + + if label_by: + for _, row in frontier.iterrows(): + label_parts = [] + for key in label_by: + if key in row: + label_parts.append(f"{key}={row[key]}") + if label_parts: + ax.text( + row["tokens_per_user"], + row["tokens_per_gpu"], + "\n".join(label_parts), + fontsize=8, + ) + + ax.set_xlabel("Tokens/s/user") + ax.set_ylabel("Tokens/s/GPU") + ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.6) + fig.tight_layout() + fig.savefig(fig_path) + plt.close(fig) + + print( + f"Plotted {len(df)} points; Pareto frontier size: {len(frontier)}.", + ) + print("[END FIGURE]") + + +def plot_pareto( + output_dir: Path, + user_count_var: str | None, + gpu_count_var: str | None, + label_by: list[str], + *, + dry_run: bool, +): + fig_dir = output_dir / "pareto" + raw_data = [ + run_data + for path in output_dir.rglob("**/summary.json") + for run_data in _json_load_bytes(path) + ] + + if not raw_data: + raise ValueError(f"Did not find any parameter sweep results under {output_dir}") + + fig_dir.mkdir(parents=True, exist_ok=True) + + prepared_data, skipped_missing_users = _prepare_records( + raw_data, + user_count_var=user_count_var, + gpu_count_var=gpu_count_var, + ) + + if skipped_missing_users: + print( + f"Skipped {skipped_missing_users} runs without a user count " + "(`max_concurrency` or `max_concurrent_requests`).", + ) + + if not prepared_data: + 
raise ValueError( + "No data points with both throughput and user count available " + "to plot Pareto frontier.", + ) + + fig_groups = full_groupby( + prepared_data, + key=lambda item: tuple(), + ) + + with DummyExecutor() if len(fig_groups) <= 1 else ProcessPoolExecutor() as executor: + all( + executor.map( + partial( + _plot_fig, + fig_dir, + label_by=label_by, + dry_run=dry_run, + ), + fig_groups, + ) + ) + + +@dataclass +class SweepPlotParetoArgs: + output_dir: Path + user_count_var: str | None + gpu_count_var: str | None + label_by: list[str] + dry_run: bool + + parser_name: ClassVar[str] = "plot_pareto" + parser_help: ClassVar[str] = ( + "Plot Pareto frontier between tokens/s/user and tokens/s/GPU " + "from parameter sweep results." + ) + + @classmethod + def from_cli_args(cls, args: argparse.Namespace): + output_dir = Path(args.OUTPUT_DIR) + if not output_dir.exists(): + raise ValueError(f"No parameter sweep results under {output_dir}") + + label_by = [] if not args.label_by else args.label_by.split(",") + + return cls( + output_dir=output_dir, + user_count_var=args.user_count_var, + gpu_count_var=args.gpu_count_var, + label_by=label_by, + dry_run=args.dry_run, + ) + + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser): + parser.add_argument( + "OUTPUT_DIR", + type=str, + default="results", + help="The directory containing the sweep results to plot.", + ) + parser.add_argument( + "--user-count-var", + type=str, + default="max_concurrency", + help="Result key that stores concurrent user count. " + "Falls back to max_concurrent_requests if missing.", + ) + parser.add_argument( + "--gpu-count-var", + type=str, + default=None, + help="Result key that stores GPU count. 
" + "If not provided, falls back to num_gpus/gpu_count " + "or tensor_parallel_size * pipeline_parallel_size.", + ) + parser.add_argument( + "--label-by", + type=str, + default="max_concurrency,gpu_count", + help="Comma-separated list of fields to annotate on Pareto frontier " + "points.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="If set, prints the figures to plot without drawing them.", + ) + + return parser + + +def run_main(args: SweepPlotParetoArgs): + return plot_pareto( + output_dir=args.output_dir, + user_count_var=args.user_count_var, + gpu_count_var=args.gpu_count_var, + label_by=args.label_by, + dry_run=args.dry_run, + ) + + +def main(args: argparse.Namespace): + run_main(SweepPlotParetoArgs.from_cli_args(args)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=SweepPlotParetoArgs.parser_help) + SweepPlotParetoArgs.add_cli_args(parser) + + main(parser.parse_args()) From cc0f2a0e19881c3c601d3e287f297b36d2a78f78 Mon Sep 17 00:00:00 2001 From: maang-h <55082429+maang-h@users.noreply.github.com> Date: Fri, 28 Nov 2025 16:12:20 +0800 Subject: [PATCH 057/197] [Doc] Improve abnormal information string (#29655) Signed-off-by: maang --- vllm/v1/engine/utils.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index d65cad7af03d6..24bf66c42f312 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -371,8 +371,7 @@ class CoreEngineActorManager: ) assert len(nodes) > 0, "No nodes with resources found in Ray cluster." 
assert dp_master_ip_key in nodes[0], ( - "The DP master node (ip: %s) is missing or dead", - dp_master_ip, + f"The DP master node (ip: {dp_master_ip}) is missing or dead" ) device_str = current_platform.ray_device_key n_node_devices: list[int] = [ @@ -446,8 +445,7 @@ class CoreEngineActorManager: if key != "node:__internal_head__" and key.startswith("node:") ] assert len(node_ip_keys) == 1, ( - "Zero or multiple node IP keys found in node resources: %s", - node_ip_keys, + f"Zero or multiple node IP keys found in node resources: {node_ip_keys}" ) node_ip_key = node_ip_keys[0] node_ip = node_ip_key.split(":")[1] @@ -464,11 +462,9 @@ class CoreEngineActorManager: if node_ip == dp_master_ip: if dp_size_available < dp_size_local: raise ValueError( - "Not enough resources to allocate %s DP ranks " - "on DP master node %s, possible to fit %s DP ranks", - dp_size_local, - dp_master_ip, - dp_size_available, + f"Not enough resources to allocate {dp_size_local} DP ranks " + f"on DP master node {dp_master_ip}, possible to fit " + f"{dp_size_available} DP ranks." 
) dp_size_to_allocate = dp_size_local elif pack_strategy == "strict": From b2c1d294faca96643dbc2413d604ca160f458f0d Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Fri, 28 Nov 2025 09:44:47 +0100 Subject: [PATCH 058/197] [BUGFIX] MistralTokenizer._call__ adds an invalid EOS token (#29607) Signed-off-by: Julien Denize Signed-off-by: Julien Denize <40604584+juliendenize@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung --- tests/tokenization/test_mistral_tokenizer.py | 68 +++++++++++++++++++ vllm/transformers_utils/tokenizers/mistral.py | 20 +++++- 2 files changed, 87 insertions(+), 1 deletion(-) diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py index 1ada8ee187c38..c80b698ba3848 100644 --- a/tests/tokenization/test_mistral_tokenizer.py +++ b/tests/tokenization/test_mistral_tokenizer.py @@ -331,6 +331,7 @@ class TestMistralTokenizer: ) == token_ids ) + assert mistral_tokenizer.encode_one("") == [] def test_encode(self, mistral_tokenizer: MistralTokenizer): token_ids = ( @@ -370,6 +371,51 @@ class TestMistralTokenizer: mistral_tokenizer.encode("Hello world !", add_special_tokens=False) == token_ids[1:] ) + assert mistral_tokenizer.encode("", add_special_tokens=False) == [] + + def test_call(self, mistral_tokenizer: MistralTokenizer): + token_ids = ( + [1, 22177, 4304, 2662] + if mistral_tokenizer.is_tekken + else [1, 23325, 2294, 1686] + ) + attn_mask = [1 for _ in range(len(token_ids))] + + # Test 1: default + assert mistral_tokenizer("Hello world !") == { + "attention_mask": attn_mask[1:], + "input_ids": token_ids[1:], + } + # Test 2: special tokens + assert mistral_tokenizer("Hello world !", add_special_tokens=True) == { + "attention_mask": attn_mask, + "input_ids": token_ids, + } + # Test 3: special tokens + truncation + assert mistral_tokenizer( + "Hello 
world !", add_special_tokens=True, truncation=True, max_length=3 + ) == { + "attention_mask": attn_mask[:-1], + "input_ids": token_ids[:-1], + } + # Test 4: special tokens + no truncation + max length + assert mistral_tokenizer( + "Hello world !", add_special_tokens=True, max_length=3 + ) == { + "attention_mask": attn_mask, + "input_ids": token_ids, + } + # Test 5: empty string + assert mistral_tokenizer("") == { + "attention_mask": [], + "input_ids": [], + } + + with pytest.raises( + ValueError, + match=(r"`text_pair` is not supported by `MistralTokenizer.__call__`."), + ): + mistral_tokenizer("Hello world !", "invalid pair") @pytest.mark.parametrize( "openai_request,add_generation_prompt,continue_final_message,expected_output,decoded_expected_output", @@ -1087,6 +1133,24 @@ class TestMistralTokenizer: ) == expected_tokens[mistral_tokenizer.is_tekken] ) + assert ( + mistral_tokenizer.decode( + ids[mistral_tokenizer.is_tekken], + skip_special_tokens=skip_special_tokens, + ) + == expected_tokens[mistral_tokenizer.is_tekken] + ) + + def test_decode_empty( + self, + mistral_tokenizer: MistralTokenizer, + ): + assert ( + mistral_tokenizer.decode( + [], + ) + == "" + ) def test_decode_int( self, @@ -1390,6 +1454,8 @@ class TestMistralTokenizer: == expected_strings[mistral_tokenizer.is_tekken] ) + assert mistral_tokenizer.convert_tokens_to_string([]) == "" + @pytest.mark.parametrize( "skip_special_tokens,tuple_expected_tokens", ( @@ -2220,3 +2286,5 @@ class TestMistralTokenizer: ids, skip_special_tokens=skip_special_tokens ) assert actual_tokens == expected_tokens + + assert mistral_tokenizer.convert_ids_to_tokens([]) == [] diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 39198a1f3d815..caff43c55ce85 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -312,13 +312,27 @@ class MistralTokenizer(TokenizerBase): truncation: bool = False, max_length: 
int | None = None, ): - return self.transformers_tokenizer( + if text_pair is not None: + raise ValueError( + "`text_pair` is not supported by `MistralTokenizer.__call__`." + ) + + encoded = self.transformers_tokenizer( text=text, text_pair=text_pair, add_special_tokens=add_special_tokens, truncation=truncation, max_length=max_length, ) + # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962 + # is in, revert to only call self.transformers_tokenizer(...). + # Hack to fix wrongly added eos token, when fix will be supported the condition + # below will be False even before the revert is done. + if encoded["input_ids"] and encoded["input_ids"][-1] == self.eos_token_id: + encoded["input_ids"].pop(-1) + if attention_mask := encoded.get("attention_mask"): + attention_mask.pop(-1) + return encoded @property def vocab(self) -> list[str]: @@ -349,6 +363,8 @@ class MistralTokenizer(TokenizerBase): max_length: int | None = None, add_special_tokens: bool | None = None, ) -> list[int]: + # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962 + # is in, directly call self.transformers_tokenizer.encode(...). encoded = self.tokenizer.encode( text, bos=add_special_tokens is not False, eos=False ) @@ -387,6 +403,8 @@ class MistralTokenizer(TokenizerBase): ) def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str: + # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962 + # is in, directly call self.transformers_tokenizer.decode(...). 
if isinstance(ids, int): ids = [ids] From 5f5521bd5d7d38d380640166294d97a839cf7ef9 Mon Sep 17 00:00:00 2001 From: Filipp Fisin <48059208+qGentry@users.noreply.github.com> Date: Fri, 28 Nov 2025 09:45:10 +0100 Subject: [PATCH 059/197] Fix parameter order in GPT-OSS weight loading function for non-MXFP4 weights (#29506) Signed-off-by: Filipp Fisin <48059208+qGentry@users.noreply.github.com> Co-authored-by: Cyrus Leung --- vllm/model_executor/models/gpt_oss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 9de3e261941b1..cff16b7a7a8cd 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -647,8 +647,8 @@ class GptOssModel(nn.Module): ) else: return self._load_weights_other( - ep_rank_start, ep_rank_end, + ep_rank_start, heads_per_rank, head_start, weights, From ccbdf51bd57761a7a7e7a5adf685fcec67c9c1bd Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Nov 2025 17:19:25 +0800 Subject: [PATCH 060/197] [Doc] Reorganize benchmark docs (#29658) Signed-off-by: DarkLight1337 --- docs/.nav.yml | 5 + docs/benchmarking/README.md | 7 + .../benchmarks.md => benchmarking/cli.md} | 335 +++--------------- docs/benchmarking/dashboard.md | 58 +++ docs/benchmarking/sweeps.md | 178 ++++++++++ 5 files changed, 291 insertions(+), 292 deletions(-) create mode 100644 docs/benchmarking/README.md rename docs/{contributing/benchmarks.md => benchmarking/cli.md} (71%) create mode 100644 docs/benchmarking/dashboard.md create mode 100644 docs/benchmarking/sweeps.md diff --git a/docs/.nav.yml b/docs/.nav.yml index c8bf00efb2370..d30c0f12eba4c 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -52,6 +52,11 @@ nav: - Plugins: - design/*plugin*.md - design/* + - Benchmarking: + - benchmarking/README.md + - benchmarking/cli.md + - benchmarking/sweeps.md + - benchmarking/dashboard.md - API Reference: - api/README.md - api/vllm diff --git 
a/docs/benchmarking/README.md b/docs/benchmarking/README.md new file mode 100644 index 0000000000000..238290d4762b3 --- /dev/null +++ b/docs/benchmarking/README.md @@ -0,0 +1,7 @@ +# Benchmark Suites + +vLLM provides comprehensive benchmarking tools for performance testing and evaluation: + +- **[Benchmark CLI](./cli.md)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing. +- **[Parameter Sweeps](./sweeps.md)**: Automate `vllm bench` runs for multiple configurations, useful for [optimization and tuning](../configuration/optimization.md). +- **[Performance Dashboard](./dashboard.md)**: Automated CI that publishes benchmarks on each commit. diff --git a/docs/contributing/benchmarks.md b/docs/benchmarking/cli.md similarity index 71% rename from docs/contributing/benchmarks.md rename to docs/benchmarking/cli.md index e4714e6266381..44a4c40125952 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/benchmarking/cli.md @@ -1,22 +1,10 @@ ---- -toc_depth: 4 ---- +# Benchmark CLI -# Benchmark Suites +This section guides you through running benchmark tests with the extensive datasets supported on vLLM. -vLLM provides comprehensive benchmarking tools for performance testing and evaluation: +It's a living document, updated as new features and datasets become available. -- **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing -- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations -- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development - -## Benchmark CLI - -This section guides you through running benchmark tests with the extensive -datasets supported on vLLM. It's a living document, updated as new features and datasets -become available. - -### Dataset Overview +## Dataset Overview