From bb5ac1fe38c9fcc7bafaee47fd45c8d1696ad176 Mon Sep 17 00:00:00 2001 From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Date: Wed, 17 Dec 2025 04:21:07 +0000 Subject: [PATCH 001/176] [CPU] Add action to automatically label CPU related PRs (#30678) Signed-off-by: Fadi Arafeh --- .github/mergify.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index 3ad79f93bc7ad..3e4e21efe39df 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -235,6 +235,20 @@ pull_request_rules: add: - rocm +- name: label-cpu + description: Automatically apply cpu label + conditions: + - label != stale + - files~=^(?!.*kv_offload)(?!.*cpu_offload).*\bcpu.* + actions: + label: + add: + - cpu + assign: + users: + - "fadara01" + - "aditew01" + - name: label-structured-output description: Automatically apply structured-output label conditions: From 44d3b1df3d6416b76d84c360d751b8f5220c0b11 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 17 Dec 2025 12:21:19 +0800 Subject: [PATCH 002/176] [CI/Build] Fix compatibility between #30244 and #30396 (#30787) Signed-off-by: DarkLight1337 --- tests/compile/distributed/test_fusions_e2e.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 80086c4e03a9c..960b5b4bd7ad4 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -564,7 +564,9 @@ def test_rms_group_quant( splitting_ops=splitting_ops, # Common mode=CompilationMode.VLLM_COMPILE, - pass_config=PassConfig(eliminate_noops=True, fuse_norm_quant=True), + pass_config=PassConfig( + fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True + ), # Inductor caches custom passes by default as well via uuid inductor_compile_config={"force_disable_caches": True}, ) From 009a773828fee13504ee2976ad02abb6020152c8 Mon Sep 17 00:00:00 2001 From: 
shanjiaz <43143795+shanjiaz@users.noreply.github.com> Date: Wed, 17 Dec 2025 00:01:04 -0500 Subject: [PATCH 003/176] bump up compressed tensors version to 0.13.0 (#30799) Signed-off-by: shanjiaz Co-authored-by: Dipika Sikka --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 426d281c26704..7c89385da6ba5 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -37,7 +37,7 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.12.2 # required for compressed-tensors +compressed-tensors == 0.13.0 # required for compressed-tensors depyf==0.20.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files From d4d2751732c3ccae162a5a0160c7d4fe05d2779a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 17 Dec 2025 00:29:03 -0500 Subject: [PATCH 004/176] Update note comment for flashinfer attention warmup (#30711) Signed-off-by: mgoin --- vllm/model_executor/warmup/kernel_warmup.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py index 95f5982bc8c7b..98b28d3e5292f 100644 --- a/vllm/model_executor/warmup/kernel_warmup.py +++ b/vllm/model_executor/warmup/kernel_warmup.py @@ -49,13 +49,12 @@ def kernel_warmup(worker: "Worker"): except NotImplementedError: return False - # NOTE: we add check for empty attn_groups to avoid errors when - # deploying models 
such as E instances and encoder-only models. - # As for those models, worker.model_runner.attn_groups is empty. - # This change is made during EPD feature development. if ( not worker.model_runner.is_pooling_model and worker.model_runner.attn_groups + # NOTE: This should be `any` instead of `all` but other hybrid attention + # backends don't support this dummy run. Once we remove + # `build_for_cudagraph_capture`, we can change it to `any`. and all( _is_flashinfer_backend(group.backend) for groups in worker.model_runner.attn_groups From 0cd5353644d3d045ab33c7e8e19c182bfd7db911 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Wed, 17 Dec 2025 15:25:12 +0800 Subject: [PATCH 005/176] [Bugfix][CPU] Fix CPU backend ROPE dispatch for VL models (#30829) Signed-off-by: jiang1.li Signed-off-by: Li, Jiang Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/layers/rotary_embedding/common.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py index 3e6584dbc3da0..50660c6ecc223 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -264,6 +264,15 @@ class ApplyRotaryEmb(CustomOp): return output + def forward_cpu( + self, + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ) -> torch.Tensor: + # TODO (bigPYJ1151): need to enable fused CPU ROPE here + return self.forward_native(x, cos, sin) + def extra_repr(self) -> str: s = f"is_neox_style={self.is_neox_style}" s += f"enable_fp32_compute={self.enable_fp32_compute}" From 4f735babb7353987137b85ec0465e594e9ed1384 Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Wed, 17 Dec 2025 16:28:13 +0800 Subject: [PATCH 006/176] [XPU] fix broken fp8 online quantization for XPU platform (#30831) Signed-off-by: Yan Ma --- .../layers/quantization/ipex_quant.py | 35 
+++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index 463c74c1c1482..f33ee43727f19 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -27,6 +27,10 @@ from vllm.model_executor.layers.quantization.awq import AWQLinearMethod from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8LinearMethod from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + maybe_create_device_identity, +) +from vllm.model_executor.parameter import ModelWeightParameter from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform @@ -305,6 +309,37 @@ class XPUFp8LinearMethod(Fp8LinearMethod): def __init__(self, quant_config: Fp8Config): super().__init__(quant_config) + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + maybe_create_device_identity() + + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + layer.orig_dtype = params_dtype + layer.weight_block_size = None + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=params_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + def process_weights_after_loading(self, layer: Module) 
-> None: # If checkpoint not serialized fp8, quantize the weights. if not self.quant_config.is_checkpoint_fp8_serialized: From 20fda431515d19a883cc962d3a1fa727f225e82d Mon Sep 17 00:00:00 2001 From: Robin <863579016@qq.com> Date: Wed, 17 Dec 2025 16:37:57 +0800 Subject: [PATCH 007/176] [Bugfix][Frontend] Prevent IndexError in MiniMax M2 tool parser during streaming extraction (#30555) Signed-off-by: WangErXiao <863579016@qq.com> --- tests/tool_use/test_minimax_m2_tool_parser.py | 119 ++++++++++++++++++ vllm/tool_parsers/minimax_m2_tool_parser.py | 22 +++- 2 files changed, 137 insertions(+), 4 deletions(-) create mode 100644 tests/tool_use/test_minimax_m2_tool_parser.py diff --git a/tests/tool_use/test_minimax_m2_tool_parser.py b/tests/tool_use/test_minimax_m2_tool_parser.py new file mode 100644 index 0000000000000..cf1835b1928b4 --- /dev/null +++ b/tests/tool_use/test_minimax_m2_tool_parser.py @@ -0,0 +1,119 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json + +import pytest + +from vllm.tool_parsers.minimax_m2_tool_parser import ( + MinimaxM2ToolParser, +) + +pytestmark = pytest.mark.cpu_test + + +class FakeTokenizer: + """Minimal fake tokenizer that exposes the attributes used by the + parser: a truthy model_tokenizer marker and a vocab mapping for the + special tokens. 
+ """ + + def __init__(self): + self.model_tokenizer = True + # The parser will look up start/end tokens by their literal strings + self.vocab = { + "": 1, + "": 2, + } + + def get_vocab(self): + return self.vocab + + +@pytest.fixture +def minimax_m2_tool_parser(): + return MinimaxM2ToolParser(FakeTokenizer()) + + +def test_extract_tool_calls_streaming_incremental(minimax_m2_tool_parser): + parser = minimax_m2_tool_parser + parser._reset_streaming_state() + chunks = [ + "", + '', + '', + "Seattle", + "", + ] + previous = "" + for chunk in chunks: + current = previous + chunk + delta = chunk + parser.extract_tool_calls_streaming( + previous_text=previous, + current_text=current, + delta_text=delta, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + previous = current + + assert len(parser.prev_tool_call_arr) == 1 + entry = parser.prev_tool_call_arr[0] + + assert entry["name"] == "get_weather" + args = entry["arguments"] + assert args["city"] == "Seattle" + + +def test_streaming_minimax_m2_multiple_invokes(minimax_m2_tool_parser): + parser = minimax_m2_tool_parser + parser._reset_streaming_state() + + chunks = [ + "", + '', + '', + '["technology", "events"]', + '', + '["OpenAI", "latest", "release"]', + "", + '', + '', + '["technology", "events"]', + '', + '["Gemini", "latest", "release"]', + "", + "", + ] + previous = "" + for chunk in chunks: + current = previous + chunk + delta = chunk + parser.extract_tool_calls_streaming( + previous_text=previous, + current_text=current, + delta_text=delta, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + previous = current + + assert len(parser.prev_tool_call_arr) == 2 + + for entry, expect_model in zip(parser.prev_tool_call_arr, ["OpenAI", "Gemini"]): + assert entry["name"] == "search_web" + args = json.dumps(entry["arguments"]) + assert "technology" in args and "events" in args + assert expect_model in args + + # check streamed_args_for_tool 
for serving_chat.py + for index in range(2): + expected_call = parser.prev_tool_call_arr[index].get("arguments", {}) + expected_call = json.dumps(expected_call) + actual_call = parser.streamed_args_for_tool[index] + assert expected_call == actual_call diff --git a/vllm/tool_parsers/minimax_m2_tool_parser.py b/vllm/tool_parsers/minimax_m2_tool_parser.py index dcb2b64f6e73c..a1ab75f548bfc 100644 --- a/vllm/tool_parsers/minimax_m2_tool_parser.py +++ b/vllm/tool_parsers/minimax_m2_tool_parser.py @@ -122,6 +122,8 @@ class MinimaxM2ToolParser(ToolParser): self.streaming_request = None # Clear previous tool call history to avoid state pollution self.prev_tool_call_arr.clear() + # Reset streamed args tracking + self.streamed_args_for_tool.clear() def _extract_name(self, name_str: str) -> str: """Extract name from quoted string.""" @@ -421,9 +423,12 @@ class MinimaxM2ToolParser(ToolParser): self.prev_tool_call_arr.append( { "name": self.current_function_name, - "arguments": "{}", # Placeholder, will be updated later + "arguments": {}, # Placeholder, will be updated later } ) + # Initialize streamed_args_for_tool for this tool call + if len(self.streamed_args_for_tool) <= self.current_tool_index: + self.streamed_args_for_tool.append("") # Send header with function info return DeltaMessage( @@ -445,6 +450,9 @@ class MinimaxM2ToolParser(ToolParser): # Send opening brace if not sent yet if self.in_function and not self.json_started: self.json_started = True + # Update streamed_args_for_tool for opening brace + if self.current_tool_index < len(self.streamed_args_for_tool): + self.streamed_args_for_tool[self.current_tool_index] += "{" return DeltaMessage( tool_calls=[ DeltaToolCall( @@ -493,7 +501,7 @@ class MinimaxM2ToolParser(ToolParser): args = parsed_tool.function.arguments self.prev_tool_call_arr[self.current_tool_index][ "arguments" - ] = args + ] = json.loads(args) except Exception: pass # Ignore parsing errors during streaming @@ -505,7 +513,9 @@ class 
MinimaxM2ToolParser(ToolParser): ) ] ) - + # Update streamed_args_for_tool for closing brace + if self.current_tool_index < len(self.streamed_args_for_tool): + self.streamed_args_for_tool[self.current_tool_index] += "}" # Reset state for next tool self.json_closed = True self.in_function = False @@ -630,7 +640,11 @@ class MinimaxM2ToolParser(ToolParser): ) self.param_count += 1 - + # Update streamed_args_for_tool for this tool call + if self.current_tool_index < len(self.streamed_args_for_tool): + self.streamed_args_for_tool[self.current_tool_index] += ( + json_fragment + ) return DeltaMessage( tool_calls=[ DeltaToolCall( From a9e15c21efbbc5b4a7a1e69e40378fdfe1acdcb7 Mon Sep 17 00:00:00 2001 From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Date: Wed, 17 Dec 2025 10:48:53 +0200 Subject: [PATCH 008/176] [Mamba] Removed disable cascade attn in MambaModelConfig (#30712) Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com> --- vllm/model_executor/models/config.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 4b08472538db4..a3624b1cfa5f2 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -308,12 +308,6 @@ class MambaModelConfig(VerifyAndUpdateConfig): if cache_config.mamba_block_size is None: cache_config.mamba_block_size = model_config.max_model_len - # TODO(tdoublep): remove once cascade attention is supported - logger.info( - "Disabling cascade attention since it is not supported for hybrid models." 
- ) - model_config.disable_cascade_attn = True - class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): @classmethod From 3b1d440ede42855f031ba72af4817583e5dddba0 Mon Sep 17 00:00:00 2001 From: Xinyu Chen Date: Wed, 17 Dec 2025 17:43:00 +0800 Subject: [PATCH 009/176] CustomOp: grouped topk (#29575) Signed-off-by: Xinyu Chen --- tests/kernels/moe/test_grouped_topk.py | 10 ++-- .../layers/fused_moe/__init__.py | 4 +- .../layers/fused_moe/fused_moe.py | 52 +++++++++++++++++++ vllm/model_executor/layers/fused_moe/layer.py | 23 +++++--- 4 files changed, 75 insertions(+), 14 deletions(-) diff --git a/tests/kernels/moe/test_grouped_topk.py b/tests/kernels/moe/test_grouped_topk.py index 662e0723b7583..d26fe50b815b4 100644 --- a/tests/kernels/moe/test_grouped_topk.py +++ b/tests/kernels/moe/test_grouped_topk.py @@ -9,8 +9,8 @@ import pytest import torch from vllm.model_executor.layers.fused_moe.fused_moe import ( + GroupedTopk, fused_grouped_topk, - grouped_topk, ) from vllm.platforms import current_platform @@ -50,15 +50,17 @@ def test_grouped_topk( with monkeypatch.context() as m: m.setenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "0") - baseline_topk_weights, baseline_topk_ids = grouped_topk( - hidden_states=hidden_states, - gating_output=gating_output, + grouped_topk = GroupedTopk( topk=topk, renormalize=renormalize, num_expert_group=num_expert_group, topk_group=topk_group, scoring_func=scoring_func, routed_scaling_factor=routed_scaling_factor, + ) + baseline_topk_weights, baseline_topk_ids = grouped_topk( + hidden_states=hidden_states, + gating_output=gating_output, e_score_correction_bias=e_score_correction_bias, ) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index d71cfc5ad8200..8fee4038b60b8 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -77,11 +77,11 @@ if HAS_TRITON: BatchedTritonExperts, ) from 
vllm.model_executor.layers.fused_moe.fused_moe import ( + GroupedTopk, TritonExperts, fused_experts, fused_topk, get_config_file_name, - grouped_topk, ) from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts, @@ -91,7 +91,7 @@ if HAS_TRITON: "fused_topk", "fused_experts", "get_config_file_name", - "grouped_topk", + "GroupedTopk", "cutlass_moe_fp8", "cutlass_moe_fp4", "cutlass_moe_w4a8_fp8", diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index b286c3bc6fc07..20782e2712f27 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -16,6 +16,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm._aiter_ops import rocm_aiter_ops from vllm.logger import init_logger +from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, ) @@ -1286,6 +1287,57 @@ def grouped_topk( return topk_weights.to(torch.float32), topk_ids.to(torch.int32) +@CustomOp.register("grouped_topk") +class GroupedTopk(CustomOp): + """GroupedTopk used by the Deepseek-V2 and Deepseek-V3 model.""" + + def __init__( + self, + topk: int, + renormalize: bool, + num_expert_group: int = 0, + topk_group: int = 0, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + ) -> None: + super().__init__() + self.native_impl = grouped_topk + self.topk = topk + self.renormalize = renormalize + self.num_expert_group = num_expert_group + self.topk_group = topk_group + self.scoring_func = scoring_func + self.routed_scaling_factor = routed_scaling_factor + + def forward_native( + self, + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + e_score_correction_bias: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + return self.native_impl( + hidden_states, + gating_output, + 
self.topk, + self.renormalize, + self.num_expert_group, + self.topk_group, + self.scoring_func, + self.routed_scaling_factor, + e_score_correction_bias, + ) + + def forward_cuda( + self, + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + e_score_correction_bias: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + return self.forward_native( + hidden_states, gating_output, e_score_correction_bias + ) + + @torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def eplb_map_to_physical_and_record( topk_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b39ce415a0f83..db97d6eb88ea5 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -67,7 +67,7 @@ else: return topk_ids eplb_map_to_physical_and_record = _eplb_map_to_physical_and_record -from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk +from vllm.model_executor.layers.fused_moe.fused_moe import GroupedTopk from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 rocm_aiter_grouped_topk, ) @@ -1594,19 +1594,26 @@ class FusedMoE(CustomOp): grouped_topk_impl = partial( rocm_aiter_grouped_topk, num_fused_shared_experts=self.num_fused_shared_experts, + topk=self.top_k, + renormalize=self.renormalize, + num_expert_group=self.num_expert_group, + topk_group=self.topk_group, + scoring_func=self.scoring_func, + routed_scaling_factor=self.routed_scaling_factor, ) else: - grouped_topk_impl = grouped_topk + grouped_topk_impl = GroupedTopk( + topk=self.top_k, + renormalize=self.renormalize, + num_expert_group=self.num_expert_group, + topk_group=self.topk_group, + scoring_func=self.scoring_func, + routed_scaling_factor=self.routed_scaling_factor, + ) topk_weights, topk_ids = grouped_topk_impl( hidden_states=hidden_states, gating_output=router_logits, - topk=self.top_k, - 
renormalize=self.renormalize, - num_expert_group=self.num_expert_group, - topk_group=self.topk_group, - scoring_func=self.scoring_func, - routed_scaling_factor=self.routed_scaling_factor, e_score_correction_bias=self.e_score_correction_bias, ) elif self.e_score_correction_bias is not None: From f4e884f2224a25612eaeaeac2a854c1dd330c144 Mon Sep 17 00:00:00 2001 From: Sheng Lin Date: Wed, 17 Dec 2025 17:52:58 +0800 Subject: [PATCH 010/176] [NIXL][Bugfix] Fix NIXL/RDMA registration failure over CuMemAllocator (#29569) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Somoku Co-authored-by: Nicolò Lucchesi --- csrc/cumem_allocator.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp index 78dc840a98b67..6c2c18a6602d2 100644 --- a/csrc/cumem_allocator.cpp +++ b/csrc/cumem_allocator.cpp @@ -107,6 +107,16 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem, prop.location.id = device; prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE; +#ifndef USE_ROCM + int flag = 0; + CUDA_CHECK(cuDeviceGetAttribute( + &flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, + device)); + if (flag) { // support GPUDirect RDMA if possible + prop.allocFlags.gpuDirectRDMACapable = 1; + } +#endif + #ifndef USE_ROCM // Allocate memory using cuMemCreate CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0)); From 4c054d89aa5972014ba7e13c0accb0ab631b5638 Mon Sep 17 00:00:00 2001 From: Andrew Xia Date: Wed, 17 Dec 2025 17:53:02 +0800 Subject: [PATCH 011/176] [Doc][ResponsesAPI] add documentation (#30840) Signed-off-by: Andrew Xia Co-authored-by: Andrew Xia --- docs/serving/openai_compatible_server.md | 27 ++++++++++++++++++++++++ vllm/entrypoints/openai/protocol.py | 18 ++++++++++++---- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/docs/serving/openai_compatible_server.md 
b/docs/serving/openai_compatible_server.md index 0e29204f8947c..6a08f872def15 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -47,6 +47,8 @@ We currently support the following OpenAI APIs: - [Completions API](#completions-api) (`/v1/completions`) - Only applicable to [text generation models](../models/generative_models.md). - *Note: `suffix` parameter is not supported.* +- [Responses API](#responses-api) (`/v1/responses`) + - Only applicable to [text generation models](../models/generative_models.md). - [Chat Completions API](#chat-api) (`/v1/chat/completions`) - Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template). - *Note: `user` parameter is ignored.* @@ -229,6 +231,31 @@ The following extra parameters are supported: --8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params" ``` +### Responses API + +Our Responses API is compatible with [OpenAI's Responses API](https://platform.openai.com/docs/api-reference/responses); +you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. + +Code example: [examples/online_serving/openai_responses_client_with_tools.py](../../examples/online_serving/openai_responses_client_with_tools.py) + +#### Extra parameters + +The following extra parameters in the request object are supported: + +??? code + + ```python + --8<-- "vllm/entrypoints/openai/protocol.py:responses-extra-params" + ``` + +The following extra parameters in the response object are supported: + +??? 
code + + ```python + --8<-- "vllm/entrypoints/openai/protocol.py:responses-response-extra-params" + ``` + ### Embeddings API Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 94dde4564ea0c..a3c347cb1bd3f 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1654,13 +1654,23 @@ class ResponsesResponse(OpenAIBaseModel): usage: ResponseUsage | None = None user: str | None = None - # --8<-- [start:responses-extra-params] + # --8<-- [start:responses-response-extra-params] # These are populated when enable_response_messages is set to True # NOTE: custom serialization is needed # see serialize_input_messages and serialize_output_messages - input_messages: ResponseInputOutputMessage | None = None - output_messages: ResponseInputOutputMessage | None = None - # --8<-- [end:responses-extra-params] + input_messages: ResponseInputOutputMessage | None = Field( + default=None, + description=( + "If enable_response_messages, we can show raw token input to model." + ), + ) + output_messages: ResponseInputOutputMessage | None = Field( + default=None, + description=( + "If enable_response_messages, we can show raw token output of model." 
+ ), + ) + # --8<-- [end:responses-response-extra-params] # NOTE: openAI harmony doesn't serialize TextContent properly, # TODO: this fixes for TextContent, but need to verify for tools etc From a100152288c8ec50336aea842f0b3d8e36624024 Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" Date: Wed, 17 Dec 2025 01:54:21 -0800 Subject: [PATCH 012/176] [Kernels][FI] Skip trtllm attention when num_kv_heads=1 (#30842) Signed-off-by: Ye (Charlotte) Qi --- .../test_flashinfer_trtllm_attention.py | 35 +++++++++++++++++++ vllm/utils/flashinfer.py | 22 +++++++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index 06a7085a82ba0..220d827b9d5fa 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -455,3 +455,38 @@ def test_flashinfer_trtllm_prefill_with_baseline( torch.testing.assert_close(output, output_trtllm, atol=atol, rtol=rtol), f"{torch.max(torch.abs(output - output_trtllm))}", ) + + +def test_trtllm_attention_rejects_num_kv_heads_1() -> None: + """Test that TRTLLM attention correctly rejects num_kv_heads=1. + + When num_kv_heads=1 (MQA), the KV cache strides become degenerate + (stride_heads == stride_batch), which causes CUDA's cuTensorMapEncodeTiled + to fail because TMA descriptors cannot handle degenerate 4D tensors with + singleton dimensions. + + This test verifies that can_use_trtllm_attention returns False for + num_kv_heads=1 configurations. 
+ """ + from vllm.utils.flashinfer import can_use_trtllm_attention + + # num_kv_heads=1 should be rejected + assert not can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=1), ( + "can_use_trtllm_attention should return False for num_kv_heads=1" + ) + assert not can_use_trtllm_attention(num_qo_heads=32, num_kv_heads=1), ( + "can_use_trtllm_attention should return False for num_kv_heads=1" + ) + + # num_kv_heads > 1 should be accepted (if platform supports it) + # Note: This may return False on non-Blackwell platforms, which is fine + result_kv8 = can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=8) + result_kv1 = can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=1) + + # Even if platform doesn't support TRTLLM, num_kv_heads=1 should never + # return True when num_kv_heads > 1 returns True + if result_kv8: + assert not result_kv1, ( + "If TRTLLM is supported for num_kv_heads=8, " + "it must be rejected for num_kv_heads=1" + ) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 1c2710be3173b..6bbe02348eaf1 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -305,7 +305,18 @@ def can_use_trtllm_attention(num_qo_heads: int, num_kv_heads: int) -> bool: if force_use_trtllm_attention() is False: return False has_trtllm = supports_trtllm_attention() - return has_trtllm and (num_qo_heads % num_kv_heads == 0) + # num_kv_heads=1 is not supported due to TMA descriptor building limitations. + # When num_kv_heads=1, the KV cache strides become degenerate (stride_heads == + # stride_batch), which causes CUDA's cuTensorMapEncodeTiled to fail because + # TMA descriptors cannot handle degenerate 4D tensors with singleton dimensions. + # See: https://fburl.com/352mrydz + if has_trtllm and num_kv_heads == 1: + logger.warning_once( + "TRTLLM attention does not support num_kv_heads=1. " + "This configuration causes TMA descriptor building to fail due to " + "degenerate tensor strides. Falling back to FlashInfer attention." 
+ ) + return has_trtllm and (num_qo_heads % num_kv_heads == 0) and (num_kv_heads != 1) def use_trtllm_attention( @@ -355,6 +366,15 @@ def use_trtllm_attention( ) return False + # num_kv_heads=1 is not supported + if num_kv_heads == 1: + if force_use_trtllm: + logger.warning_once( + "TRTLLM attention does not support num_kv_heads=1, " + "but --attention-config.use_trtllm_attention is set to 1" + ) + return False + if has_spec and not is_prefill: # Speculative decoding requires TRTLLM attention for decodes logger.info_once("Using TRTLLM attention (enabled for speculative decoding).") From 519ef9a91111d2d6f8545c8a6b2c1a28d87309fa Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 17 Dec 2025 04:55:30 -0500 Subject: [PATCH 013/176] [UX] Make `vllm bench serve` discover model by default and use --input-len (#30816) Signed-off-by: mgoin --- tests/benchmarks/test_serve_cli.py | 9 ++-- vllm/benchmarks/serve.py | 83 +++++++++++++++++++++++++++--- 2 files changed, 79 insertions(+), 13 deletions(-) diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index 90d685c966d3e..c579b38069864 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -19,21 +19,18 @@ def server(): @pytest.mark.benchmark def test_bench_serve(server): + # Test default model detection and input/output len command = [ "vllm", "bench", "serve", - "--model", - MODEL_NAME, "--host", server.host, "--port", str(server.port), - "--dataset-name", - "random", - "--random-input-len", + "--input-len", "32", - "--random-output-len", + "--output-len", "4", "--num-prompts", "5", diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index f5d8ea5a975a9..12756d1700c9f 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -10,8 +10,10 @@ On the client side, run: vllm bench serve \ --backend \ --label \ - --model \ + --model \ --dataset-name \ + --input-len \ + --output-len \ --request-rate \ --num-prompts """ @@ -57,6 
+59,33 @@ TERM_PLOTLIB_AVAILABLE = (importlib.util.find_spec("termplotlib") is not None) a ) +async def get_first_model_from_server( + base_url: str, headers: dict | None = None +) -> str: + """Fetch the first model from the server's /v1/models endpoint.""" + models_url = f"{base_url}/v1/models" + async with aiohttp.ClientSession() as session: + try: + async with session.get(models_url, headers=headers) as response: + response.raise_for_status() + data = await response.json() + if "data" in data and len(data["data"]) > 0: + return data["data"][0]["id"] + else: + raise ValueError( + f"No models found on the server at {base_url}. " + "Make sure the server is running and has models loaded." + ) + except (aiohttp.ClientError, json.JSONDecodeError) as e: + raise RuntimeError( + f"Failed to fetch models from server at {models_url}. " + "Check that:\n" + "1. The server is running\n" + "2. The server URL is correct\n" + f"Error: {e}" + ) from e + + class TaskType(Enum): GENERATION = "generation" POOLING = "pooling" @@ -1025,8 +1054,26 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--model", type=str, - required=True, - help="Name of the model.", + required=False, + default=None, + help="Name of the model. If not specified, will fetch the first model " + "from the server's /v1/models endpoint.", + ) + parser.add_argument( + "--input-len", + type=int, + default=None, + help="General input length for datasets. Maps to dataset-specific " + "input length arguments (e.g., --random-input-len, --sonnet-input-len). " + "If not specified, uses dataset defaults.", + ) + parser.add_argument( + "--output-len", + type=int, + default=None, + help="General output length for datasets. Maps to dataset-specific " + "output length arguments (e.g., --random-output-len, --sonnet-output-len). 
" + "If not specified, uses dataset defaults.", ) parser.add_argument( "--tokenizer", @@ -1332,10 +1379,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: raise ValueError("For exponential ramp-up, the start RPS cannot be 0.") label = args.label - model_id = args.model - model_name = args.served_model_name - tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model - tokenizer_mode = args.tokenizer_mode if args.base_url is not None: api_url = f"{args.base_url}{args.endpoint}" @@ -1356,6 +1399,18 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: else: raise ValueError("Invalid header format. Please use KEY=VALUE format.") + # Fetch model from server if not specified + if args.model is None: + print("Model not specified, fetching first model from server...") + model_id = await get_first_model_from_server(base_url, headers) + print(f"Using model: {model_id}") + else: + model_id = args.model + + model_name = args.served_model_name + tokenizer_id = args.tokenizer if args.tokenizer is not None else model_id + tokenizer_mode = args.tokenizer_mode + tokenizer = get_tokenizer( tokenizer_id, tokenizer_mode=tokenizer_mode, @@ -1368,6 +1423,20 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: "'--dataset-path' if required." 
) + # Map general --input-len and --output-len to all dataset-specific arguments + if args.input_len is not None: + args.random_input_len = args.input_len + args.sonnet_input_len = args.input_len + + if args.output_len is not None: + args.random_output_len = args.output_len + args.sonnet_output_len = args.output_len + args.sharegpt_output_len = args.output_len + args.custom_output_len = args.output_len + args.hf_output_len = args.output_len + args.spec_bench_output_len = args.output_len + args.prefix_repetition_output_len = args.output_len + # when using random datasets, default to ignoring EOS # so generation runs to the requested length if ( From 177c391db2ad8dfc05906473525d4ae0a55549e0 Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Wed, 17 Dec 2025 04:55:56 -0500 Subject: [PATCH 014/176] [compile] Disable aot when eager backend is used. (#30810) Signed-off-by: zhxchen17 --- vllm/compilation/decorators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index d1ee995ee8959..40bde97ac61d8 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -435,7 +435,10 @@ def _support_torch_compile( return self.aot_compiled_fn(self, *args, **kwargs) if self.compiled: - assert not envs.VLLM_USE_AOT_COMPILE + assert ( + not envs.VLLM_USE_AOT_COMPILE + or self.vllm_config.compilation_config.backend == "eager" + ) return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs) # This is the path for the first compilation. 
@@ -508,7 +511,11 @@ def _support_torch_compile( _torch27_patch_tensor_subclasses(), torch._inductor.config.patch(**inductor_config_patches), ): - if envs.VLLM_USE_AOT_COMPILE: + use_aot_compile = envs.VLLM_USE_AOT_COMPILE + if self.vllm_config.compilation_config.backend == "eager": + logger.warning("Detected eager backend, disabling AOT compile.") + use_aot_compile = False + if use_aot_compile: self.aot_compiled_fn = self.aot_compile(*args, **kwargs) output = self.aot_compiled_fn(self, *args, **kwargs) assert aot_compilation_path is not None From 9db1db5949f7abd4b03cd0231450f81bfeeaba0f Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Wed, 17 Dec 2025 04:56:24 -0500 Subject: [PATCH 015/176] [compile] Ignore VLLM_FORCE_AOT_LOAD from cache factors (#30809) Signed-off-by: zhxchen17 --- vllm/envs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/envs.py b/vllm/envs.py index 7e072a588591c..2f8158d88d6c5 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1654,6 +1654,7 @@ def compile_factors() -> dict[str, object]: "VLLM_CI_USE_S3", "VLLM_MODEL_REDIRECT_PATH", "VLLM_HOST_IP", + "VLLM_FORCE_AOT_LOAD", "S3_ACCESS_KEY_ID", "S3_SECRET_ACCESS_KEY", "S3_ENDPOINT_URL", From 7b966ae2ba73b5391937907bfd8aaf63af033ff1 Mon Sep 17 00:00:00 2001 From: danielafrimi <45691845+danielafrimi@users.noreply.github.com> Date: Wed, 17 Dec 2025 11:56:38 +0200 Subject: [PATCH 016/176] [Fix]Load kv-cache dtype from hf_quant_config.json automatically (fix for reverted PR) (#30785) Signed-off-by: <> Co-authored-by: root --- vllm/engine/arg_utils.py | 9 ++++- vllm/utils/torch_utils.py | 75 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ca19e468914c7..03720bd2516d4 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -93,6 +93,7 @@ from vllm.transformers_utils.utils import is_cloud_storage from vllm.utils.argparse_utils import FlexibleArgumentParser from 
vllm.utils.mem_constants import GiB_bytes from vllm.utils.network_utils import get_ip +from vllm.utils.torch_utils import resolve_kv_cache_dtype_string from vllm.v1.sample.logits_processor import LogitsProcessor if TYPE_CHECKING: @@ -106,6 +107,7 @@ else: LoadFormats = Any UsageContext = Any + logger = init_logger(__name__) # object is used to allow for special typing forms @@ -1361,12 +1363,17 @@ class EngineArgs: f"dcp_size={self.decode_context_parallel_size}." ) + # Resolve "auto" kv_cache_dtype to actual value from model config + resolved_cache_dtype = resolve_kv_cache_dtype_string( + self.kv_cache_dtype, model_config + ) + cache_config = CacheConfig( block_size=self.block_size, gpu_memory_utilization=self.gpu_memory_utilization, kv_cache_memory_bytes=self.kv_cache_memory_bytes, swap_space=self.swap_space, - cache_dtype=self.kv_cache_dtype, + cache_dtype=resolved_cache_dtype, is_attention_free=model_config.is_attention_free, num_gpu_blocks_override=self.num_gpu_blocks_override, sliding_window=sliding_window, diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index c97efce312b56..b82e0171b7f7f 100644 --- a/vllm/utils/torch_utils.py +++ b/vllm/utils/torch_utils.py @@ -24,6 +24,10 @@ else: ModelConfig = object IntermediateTensors = object +import logging + +logger = logging.getLogger(__name__) + STR_DTYPE_TO_TORCH_DTYPE = { "float32": torch.float32, @@ -49,6 +53,13 @@ TORCH_DTYPE_TO_NUMPY_DTYPE = { } +MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP = { + # TODO: Add more modelopt kv cache dtype + # mappings here when it supported by some attention backend + # (for example supports nvfp4). + "fp8": "fp8_e4m3", +} + T = TypeVar("T") @@ -194,6 +205,70 @@ def get_kv_cache_torch_dtype( return torch_dtype +def get_kv_cache_quant_algo_string(quant_cfg: dict[str, Any]) -> str | None: + """Get the KV cache quantization algorithm string from the quantization config. + + Maps various FP8 format names to vLLM's standard cache dtype strings. 
+ Returns None if no kv_cache_quant_algo is specified. + Returns "auto" if the value is not recognized/supported. + """ + # Mapping from model config values to vLLM cache_dtype strings + + quant_method = quant_cfg.get("quant_method", "") + if quant_method.startswith("modelopt"): + quantization_inner = quant_cfg.get("quantization", quant_cfg) + # Check if quant config is specified and use kv cache quant algo + kv_algo = quantization_inner.get("kv_cache_quant_algo") or quant_cfg.get( + "kv_cache_quant_algo" + ) + if isinstance(kv_algo, str): + kv_algo_lower = kv_algo.lower() + + # Try to map to vLLM's standard format + if kv_algo_lower in MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP: + return MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP[kv_algo_lower] + else: + # Unknown/unsupported format - return "auto" as safe fallback + logger.warning( + "WARNING: Unknown kv_cache_quant_algo '%s' in model " + "config. Supported values: %s. Falling back to 'auto'.", + kv_algo, + list(MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP.keys()), + ) + return "auto" + return None + + +def get_kv_cache_quant_algo_dtype(quant_cfg: dict[str, Any]) -> torch.dtype | None: + """Get the KV cache quantization algorithm dtype from the quantization config.""" + kv_algo_str = get_kv_cache_quant_algo_string(quant_cfg) + if kv_algo_str is not None and kv_algo_str != "auto": + # Only convert if we have a valid dtype string (not "auto" fallback) + return STR_DTYPE_TO_TORCH_DTYPE[kv_algo_str] + return None + + +def resolve_kv_cache_dtype_string( + kv_cache_dtype: str, model_config: ModelConfig +) -> str: + """Resolve 'auto' kv_cache_dtype to the actual string value from model config. + Returns the resolved cache_dtype string. 
+ """ + if kv_cache_dtype != "auto": + return kv_cache_dtype + + hf_cfg = getattr(model_config, "hf_config", None) + if hf_cfg is not None: + quant_cfg = getattr(hf_cfg, "quantization_config", None) + if quant_cfg is not None: + kv_algo_str = get_kv_cache_quant_algo_string(quant_cfg) + if kv_algo_str is not None: + return kv_algo_str + + # Default to auto (will be handled by downstream code) + return "auto" + + def kv_cache_dtype_str_to_dtype( kv_cache_dtype: str, model_config: ModelConfig ) -> torch.dtype: From 53cd7f868b3632cbbe982cffaee8e16fb49dd694 Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Wed, 17 Dec 2025 05:00:12 -0500 Subject: [PATCH 017/176] [compile] Recompile graph module during Dynamo cache loading. (#30743) Signed-off-by: Zhengxu Chen --- vllm/compilation/caching.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py index ce482572b401b..fc02a08f74265 100644 --- a/vllm/compilation/caching.py +++ b/vllm/compilation/caching.py @@ -104,6 +104,7 @@ class VllmSerializableFunction(SerializableCallable): state = pickle.loads(data) fake_mode = FakeTensorMode(shape_env=ShapeEnv()) state["graph_module"] = GraphPickler.loads(state["graph_module"], fake_mode) + state["graph_module"].recompile() state["example_inputs"] = GraphPickler.loads(state["example_inputs"], fake_mode) vllm_backend = VllmBackend(get_current_vllm_config(), state["prefix"]) From f284d7bd0c55f929fa7912936b1d247089679191 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 17 Dec 2025 05:00:35 -0500 Subject: [PATCH 018/176] [Bug] Fix AttributeError: 'ColumnParallelLinear' object has no attribute `weight_scale_inv` (#30823) Signed-off-by: yewentao256 --- vllm/model_executor/layers/quantization/utils/fp8_utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py 
b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index ea68745585160..bdc3d1fc7232d 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -1437,14 +1437,17 @@ def maybe_post_process_fp8_weight_block(layer: torch.nn.Module): layer.orig_dtype, layer.weight ) if should_use_deepgemm: + scale_attr = ( + "weight_scale_inv" if hasattr(layer, "weight_scale_inv") else "weight_scale" + ) dg_weight, dg_weight_scale = deepgemm_post_process_fp8_weight_block( wq=layer.weight.data, - ws=layer.weight_scale_inv.data, + ws=getattr(layer, scale_attr).data, quant_block_shape=tuple(layer.weight_block_size), use_e8m0=is_deep_gemm_e8m0_used(), ) replace_parameter(layer, "weight", dg_weight) - replace_parameter(layer, "weight_scale_inv", dg_weight_scale) + replace_parameter(layer, scale_attr, dg_weight_scale) def expert_weight_is_col_major(x: torch.Tensor) -> bool: From 9ad5b2171002522772de0a0cc71b747068ec8862 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Wed, 17 Dec 2025 18:27:30 +0800 Subject: [PATCH 019/176] [Refactor] [4/N] Move VLLM_SERVER_DEV endpoints into the serve directory (#30749) Signed-off-by: chaunceyjiang --- .../scripts/hardware_ci/run-amd-test.sh | 1 - .buildkite/test-amd.yaml | 37 ++++--- .buildkite/test-pipeline.yaml | 34 ++++--- .buildkite/test_areas/entrypoints.yaml | 23 ++++- .buildkite/test_areas/tool_use.yaml | 13 --- tests/entrypoints/instrumentator/__init__.py | 0 .../test_metrics.py | 5 +- tests/entrypoints/rpc/__init__.py | 0 .../{openai => rpc}/test_collective_rpc.py | 2 +- tests/entrypoints/sleep/__init__.py | 0 .../{openai => sleep}/test_sleep.py | 2 +- vllm/entrypoints/openai/api_server.py | 98 +------------------ vllm/entrypoints/serve/__init__.py | 29 ++++++ vllm/entrypoints/serve/cache/__init__.py | 0 vllm/entrypoints/serve/cache/api_router.py | 61 ++++++++++++ .../serve/instrumentator/server_info.py | 40 ++++++++ vllm/entrypoints/serve/rpc/__init__.py | 0 
vllm/entrypoints/serve/rpc/api_router.py | 61 ++++++++++++ vllm/entrypoints/serve/sleep/api_router.py | 4 - 19 files changed, 259 insertions(+), 151 deletions(-) delete mode 100644 .buildkite/test_areas/tool_use.yaml create mode 100644 tests/entrypoints/instrumentator/__init__.py rename tests/entrypoints/{openai => instrumentator}/test_metrics.py (99%) create mode 100644 tests/entrypoints/rpc/__init__.py rename tests/entrypoints/{openai => rpc}/test_collective_rpc.py (96%) create mode 100644 tests/entrypoints/sleep/__init__.py rename tests/entrypoints/{openai => sleep}/test_sleep.py (98%) create mode 100644 vllm/entrypoints/serve/cache/__init__.py create mode 100644 vllm/entrypoints/serve/cache/api_router.py create mode 100644 vllm/entrypoints/serve/instrumentator/server_info.py create mode 100644 vllm/entrypoints/serve/rpc/__init__.py create mode 100644 vllm/entrypoints/serve/rpc/api_router.py diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 864eb470bb0a7..08da34d81d117 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -141,7 +141,6 @@ if [[ $commands == *" entrypoints/openai "* ]]; then --ignore=entrypoints/openai/test_audio.py \ --ignore=entrypoints/openai/test_shutdown.py \ --ignore=entrypoints/openai/test_completion.py \ - --ignore=entrypoints/openai/test_sleep.py \ --ignore=entrypoints/openai/test_models.py \ --ignore=entrypoints/openai/test_lora_adapters.py \ --ignore=entrypoints/openai/test_return_tokens_as_ids.py \ diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 3c9b8cbedcf06..e8f99100a8de0 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -128,7 +128,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode 
--ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 @@ -148,7 +148,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Integration Test (API Server) # 100min +- label: Entrypoints Integration Test (API Server 1) # 100min timeout_in_minutes: 130 mirror_hardwares: [amdexperimental] agent_pool: mi325_1 @@ -162,10 +162,28 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py +- label: Entrypoints Integration Test (API Server 2) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + 
source_file_dependencies: + - vllm/ + - tests/entrypoints/sleep + - tests/entrypoints/rpc + - tests/tool_use + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/sleep + - pytest -v -s tool_use + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - label: Entrypoints Integration Test (Pooling) timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] @@ -751,17 +769,6 @@ steps: # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 - pytest -s entrypoints/openai/correctness/ -- label: OpenAI-Compatible Tool Use # 23 min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - fast_check: false - source_file_dependencies: - - vllm/ - - tests/tool_use - commands: - - pytest -v -s tool_use ##### models test ##### diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8e6d32f71f220..b4de630b09417 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -114,7 +114,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 @@ -132,7 +132,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Integration Test (API Server) # 
100min +- label: Entrypoints Integration Test (API Server 1) # 100min timeout_in_minutes: 130 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" @@ -144,10 +144,26 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py +- label: Entrypoints Integration Test (API Server 2) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/sleep + - tests/entrypoints/rpc + - tests/tool_use + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/sleep + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - pytest -v -s tool_use + - label: Entrypoints Integration Test (Pooling) timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] @@ -666,16 +682,6 @@ steps: commands: # LMEval+Transcription WER check - pytest -s entrypoints/openai/correctness/ -- label: OpenAI-Compatible Tool Use # 23 min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental] - fast_check: false - source_file_dependencies: - - vllm/ - - 
tests/tool_use - commands: - - pytest -v -s tool_use - ##### models test ##### - label: Basic Models Tests (Initialization) diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index 0a789be943f37..5b16ea9c1ad07 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -10,7 +10,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration (LLM) timeout_in_minutes: 40 @@ -25,7 +25,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Integration (API Server) +- label: Entrypoints Integration (API Server 1) timeout_in_minutes: 130 working_dir: "/vllm-workspace/tests" source_file_dependencies: @@ -34,11 +34,26 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/openai 
--ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py +- label: Entrypoints Integration (API Server 2) + timeout_in_minutes: 130 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/tool_use + - tests/entrypoints/sleep + - tests/entrypoints/instrumentator + - tests/entrypoints/rpc + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - pytest -v -s entrypoints/instrumentator + - pytest -v -s entrypoints/sleep + - pytest -v -s tool_use + - label: Entrypoints Integration (Pooling) timeout_in_minutes: 50 working_dir: "/vllm-workspace/tests" diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml deleted file mode 100644 index 69527a1214229..0000000000000 --- a/.buildkite/test_areas/tool_use.yaml +++ /dev/null @@ -1,13 +0,0 @@ -group: Tool use -depends_on: - - image-build -steps: -- label: OpenAI-Compatible Tool Use - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental] - fast_check: false - source_file_dependencies: - - vllm/ - - tests/tool_use - commands: - - pytest -v -s tool_use diff --git a/tests/entrypoints/instrumentator/__init__.py b/tests/entrypoints/instrumentator/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/instrumentator/test_metrics.py similarity index 99% rename from tests/entrypoints/openai/test_metrics.py rename to tests/entrypoints/instrumentator/test_metrics.py index 65a6fd20bd0d1..9f2ad105a380b 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/instrumentator/test_metrics.py @@ -14,11 +14,10 @@ import requests from prometheus_client.parser import 
text_string_to_metric_families from transformers import AutoTokenizer +from tests.conftest import LocalAssetServer +from tests.utils import RemoteOpenAIServer from vllm import version -from ...conftest import LocalAssetServer -from ...utils import RemoteOpenAIServer - MODELS = { "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct", diff --git a/tests/entrypoints/rpc/__init__.py b/tests/entrypoints/rpc/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/openai/test_collective_rpc.py b/tests/entrypoints/rpc/test_collective_rpc.py similarity index 96% rename from tests/entrypoints/openai/test_collective_rpc.py rename to tests/entrypoints/rpc/test_collective_rpc.py index cbd6b02f05dce..56d93a427315f 100644 --- a/tests/entrypoints/openai/test_collective_rpc.py +++ b/tests/entrypoints/rpc/test_collective_rpc.py @@ -37,7 +37,7 @@ def server(): "--max-num-seqs", "128", "--worker-extension-cls", - "tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension", + "tests.entrypoints.rpc.test_collective_rpc.TestWorkerExtension", ] with RemoteOpenAIServer( MODEL_NAME, diff --git a/tests/entrypoints/sleep/__init__.py b/tests/entrypoints/sleep/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/sleep/test_sleep.py similarity index 98% rename from tests/entrypoints/openai/test_sleep.py rename to tests/entrypoints/sleep/test_sleep.py index 5f94ac6da2c25..260dcd00bae91 100644 --- a/tests/entrypoints/openai/test_sleep.py +++ b/tests/entrypoints/sleep/test_sleep.py @@ -4,7 +4,7 @@ import requests from prometheus_client.parser import text_string_to_metric_families -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer MODEL_NAME = "meta-llama/Llama-3.2-1B" diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 
5d0eacae34dd7..bca9571e39344 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -17,21 +17,20 @@ from argparse import Namespace from collections.abc import AsyncGenerator, AsyncIterator, Awaitable from contextlib import asynccontextmanager from http import HTTPStatus -from typing import Annotated, Any, Literal +from typing import Annotated, Any import model_hosting_container_standards.sagemaker as sagemaker_standards import pydantic import uvloop -from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Query, Request +from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse, Response, StreamingResponse +from fastapi.responses import JSONResponse, StreamingResponse from starlette.concurrency import iterate_in_threadpool from starlette.datastructures import URL, Headers, MutableHeaders, State from starlette.types import ASGIApp, Message, Receive, Scope, Send import vllm.envs as envs -from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient from vllm.entrypoints.anthropic.protocol import ( @@ -639,97 +638,6 @@ async def create_translations( return StreamingResponse(content=generator, media_type="text/event-stream") -if envs.VLLM_SERVER_DEV_MODE: - logger.warning( - "SECURITY WARNING: Development endpoints are enabled! " - "This should NOT be used in production!" 
- ) - - PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig) - - @router.get("/server_info") - async def show_server_info( - raw_request: Request, - config_format: Annotated[Literal["text", "json"], Query()] = "text", - ): - vllm_config: VllmConfig = raw_request.app.state.vllm_config - server_info = { - "vllm_config": str(vllm_config) - if config_format == "text" - else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str) - # fallback=str is needed to handle e.g. torch.dtype - } - return JSONResponse(content=server_info) - - @router.post("/reset_prefix_cache") - async def reset_prefix_cache( - raw_request: Request, - reset_running_requests: bool = Query(default=False), - reset_external: bool = Query(default=False), - ): - """ - Reset the local prefix cache. - - Optionally, if the query parameter `reset_external=true` - also resets the external (connector-managed) prefix cache. - - Note that we currently do not check if the prefix cache - is successfully reset in the API server. - - Example: - POST /reset_prefix_cache?reset_external=true - """ - logger.info("Resetting prefix cache...") - - await engine_client(raw_request).reset_prefix_cache( - reset_running_requests, reset_external - ) - return Response(status_code=200) - - @router.post("/reset_mm_cache") - async def reset_mm_cache(raw_request: Request): - """ - Reset the multi-modal cache. Note that we currently do not check if the - multi-modal cache is successfully reset in the API server. 
- """ - logger.info("Resetting multi-modal cache...") - await engine_client(raw_request).reset_mm_cache() - return Response(status_code=200) - - @router.post("/collective_rpc") - async def collective_rpc(raw_request: Request): - try: - body = await raw_request.json() - except json.JSONDecodeError as e: - raise HTTPException( - status_code=HTTPStatus.BAD_REQUEST.value, - detail=f"JSON decode error: {e}", - ) from e - method = body.get("method") - if method is None: - raise HTTPException( - status_code=HTTPStatus.BAD_REQUEST.value, - detail="Missing 'method' in request body", - ) - # For security reason, only serialized string args/kwargs are passed. - # User-defined `method` is responsible for deserialization if needed. - args: list[str] = body.get("args", []) - kwargs: dict[str, str] = body.get("kwargs", {}) - timeout: float | None = body.get("timeout") - results = await engine_client(raw_request).collective_rpc( - method=method, timeout=timeout, args=tuple(args), kwargs=kwargs - ) - if results is None: - return Response(status_code=200) - response: list[Any] = [] - for result in results: - if result is None or isinstance(result, dict | list): - response.append(result) - else: - response.append(str(result)) - return JSONResponse(content={"results": response}) - - def load_log_config(log_config_file: str | None) -> dict | None: if not log_config_file: return None diff --git a/vllm/entrypoints/serve/__init__.py b/vllm/entrypoints/serve/__init__.py index c4fcc92db931f..260fd44a02ccb 100644 --- a/vllm/entrypoints/serve/__init__.py +++ b/vllm/entrypoints/serve/__init__.py @@ -4,8 +4,19 @@ from fastapi import FastAPI +import vllm.envs as envs +from vllm.logger import init_logger + +logger = init_logger(__name__) + def register_vllm_serve_api_routers(app: FastAPI): + if envs.VLLM_SERVER_DEV_MODE: + logger.warning( + "SECURITY WARNING: Development endpoints are enabled! " + "This should NOT be used in production!" 
+ ) + from vllm.entrypoints.serve.lora.api_router import ( attach_router as attach_lora_router, ) @@ -29,6 +40,18 @@ def register_vllm_serve_api_routers(app: FastAPI): attach_sleep_router(app) + from vllm.entrypoints.serve.rpc.api_router import ( + attach_router as attach_rpc_router, + ) + + attach_rpc_router(app) + + from vllm.entrypoints.serve.cache.api_router import ( + attach_router as attach_cache_router, + ) + + attach_cache_router(app) + from vllm.entrypoints.serve.tokenize.api_router import ( attach_router as attach_tokenize_router, ) @@ -58,3 +81,9 @@ def register_vllm_serve_api_routers(app: FastAPI): ) attach_health_router(app) + + from vllm.entrypoints.serve.instrumentator.server_info import ( + attach_router as attach_server_info_router, + ) + + attach_server_info_router(app) diff --git a/vllm/entrypoints/serve/cache/__init__.py b/vllm/entrypoints/serve/cache/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/entrypoints/serve/cache/api_router.py b/vllm/entrypoints/serve/cache/api_router.py new file mode 100644 index 0000000000000..d659895463273 --- /dev/null +++ b/vllm/entrypoints/serve/cache/api_router.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from fastapi import APIRouter, FastAPI, Query, Request +from fastapi.responses import Response + +import vllm.envs as envs +from vllm.engine.protocol import EngineClient +from vllm.logger import init_logger + +logger = init_logger(__name__) + +router = APIRouter() + + +def engine_client(request: Request) -> EngineClient: + return request.app.state.engine_client + + +@router.post("/reset_prefix_cache") +async def reset_prefix_cache( + raw_request: Request, + reset_running_requests: bool = Query(default=False), + reset_external: bool = Query(default=False), +): + """ + Reset the local prefix cache. 
+ + Optionally, if the query parameter `reset_external=true` + also resets the external (connector-managed) prefix cache. + + Note that we currently do not check if the prefix cache + is successfully reset in the API server. + + Example: + POST /reset_prefix_cache?reset_external=true + """ + logger.info("Resetting prefix cache...") + + await engine_client(raw_request).reset_prefix_cache( + reset_running_requests, reset_external + ) + return Response(status_code=200) + + +@router.post("/reset_mm_cache") +async def reset_mm_cache(raw_request: Request): + """ + Reset the multi-modal cache. Note that we currently do not check if the + multi-modal cache is successfully reset in the API server. + """ + logger.info("Resetting multi-modal cache...") + await engine_client(raw_request).reset_mm_cache() + return Response(status_code=200) + + +def attach_router(app: FastAPI): + if not envs.VLLM_SERVER_DEV_MODE: + return + app.include_router(router) diff --git a/vllm/entrypoints/serve/instrumentator/server_info.py b/vllm/entrypoints/serve/instrumentator/server_info.py new file mode 100644 index 0000000000000..1a69dfacae1c2 --- /dev/null +++ b/vllm/entrypoints/serve/instrumentator/server_info.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from typing import Annotated, Literal + +import pydantic +from fastapi import APIRouter, FastAPI, Query, Request +from fastapi.responses import JSONResponse + +import vllm.envs as envs +from vllm.config import VllmConfig +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +router = APIRouter() +PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig) + + +@router.get("/server_info") +async def show_server_info( + raw_request: Request, + config_format: Annotated[Literal["text", "json"], Query()] = "text", +): + vllm_config: VllmConfig = raw_request.app.state.vllm_config + server_info = { + "vllm_config": str(vllm_config) + if config_format 
== "text" + else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str) + # fallback=str is needed to handle e.g. torch.dtype + } + return JSONResponse(content=server_info) + + +def attach_router(app: FastAPI): + if not envs.VLLM_SERVER_DEV_MODE: + return + app.include_router(router) diff --git a/vllm/entrypoints/serve/rpc/__init__.py b/vllm/entrypoints/serve/rpc/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/entrypoints/serve/rpc/api_router.py b/vllm/entrypoints/serve/rpc/api_router.py new file mode 100644 index 0000000000000..54f582c408d54 --- /dev/null +++ b/vllm/entrypoints/serve/rpc/api_router.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +from http import HTTPStatus +from typing import Any + +from fastapi import APIRouter, FastAPI, HTTPException, Request +from fastapi.responses import JSONResponse, Response + +import vllm.envs as envs +from vllm.engine.protocol import EngineClient +from vllm.logger import init_logger + +logger = init_logger(__name__) + +router = APIRouter() + + +def engine_client(request: Request) -> EngineClient: + return request.app.state.engine_client + + +@router.post("/collective_rpc") +async def collective_rpc(raw_request: Request): + try: + body = await raw_request.json() + except json.JSONDecodeError as e: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, + detail=f"JSON decode error: {e}", + ) from e + method = body.get("method") + if method is None: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, + detail="Missing 'method' in request body", + ) + # For security reason, only serialized string args/kwargs are passed. + # User-defined `method` is responsible for deserialization if needed. 
+ args: list[str] = body.get("args", []) + kwargs: dict[str, str] = body.get("kwargs", {}) + timeout: float | None = body.get("timeout") + results = await engine_client(raw_request).collective_rpc( + method=method, timeout=timeout, args=tuple(args), kwargs=kwargs + ) + if results is None: + return Response(status_code=200) + response: list[Any] = [] + for result in results: + if result is None or isinstance(result, dict | list): + response.append(result) + else: + response.append(str(result)) + return JSONResponse(content={"results": response}) + + +def attach_router(app: FastAPI): + if not envs.VLLM_SERVER_DEV_MODE: + return + app.include_router(router) diff --git a/vllm/entrypoints/serve/sleep/api_router.py b/vllm/entrypoints/serve/sleep/api_router.py index bc01e185315c8..c0e4c3028b2ea 100644 --- a/vllm/entrypoints/serve/sleep/api_router.py +++ b/vllm/entrypoints/serve/sleep/api_router.py @@ -52,9 +52,5 @@ async def is_sleeping(raw_request: Request): def attach_router(app: FastAPI): if not envs.VLLM_SERVER_DEV_MODE: return - logger.warning( - "SECURITY WARNING: Development endpoints are enabled! " - "This should NOT be used in production!" - ) app.include_router(router) From 4bf6c2366818a1eeae257e06ec337039e6895f13 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 17 Dec 2025 02:30:56 -0800 Subject: [PATCH 020/176] [ci] Sync test areas yaml file with test-pipeline (#30862) Signed-off-by: Kevin H. 
Luu --- .buildkite/test_areas/e2e_integration.yaml | 19 +------------------ .buildkite/test_areas/lm_eval.yaml | 4 ++-- .buildkite/test_areas/lora.yaml | 2 ++ .buildkite/test_areas/models_basic.yaml | 2 ++ .buildkite/test_areas/pytorch.yaml | 4 +++- 5 files changed, 10 insertions(+), 21 deletions(-) diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml index 93d389815edac..2e0857986c3fa 100644 --- a/.buildkite/test_areas/e2e_integration.yaml +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -32,6 +32,7 @@ steps: - label: Prime-RL Integration (2 GPUs) timeout_in_minutes: 30 optional: true + soft_fail: true num_gpus: 2 working_dir: "/vllm-workspace" source_file_dependencies: @@ -39,21 +40,3 @@ steps: - .buildkite/scripts/run-prime-rl-test.sh commands: - bash .buildkite/scripts/run-prime-rl-test.sh - -- label: DeepSeek V2-Lite Async EPLB Accuracy - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030 - -- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml index 9af43e0c375a8..e2498512bdef7 100644 --- a/.buildkite/test_areas/lm_eval.yaml +++ b/.buildkite/test_areas/lm_eval.yaml @@ -9,7 +9,7 @@ steps: - vllm/model_executor/layers/quantization autorun_on_main: true commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - label: LM Eval Large Models (4 GPUs)(A100) gpu: a100 @@ -43,4 +43,4 @@ steps: - 
csrc/ - vllm/model_executor/layers/quantization commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml index 809b4138f44ba..59ade40cc8f52 100644 --- a/.buildkite/test_areas/lora.yaml +++ b/.buildkite/test_areas/lora.yaml @@ -22,6 +22,8 @@ steps: # FIXIT: find out which code initialize cuda before running the test # before the fix, we need to use spawn to test it - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # Alot of these tests are on the edge of OOMing + - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # There is some Tensor Parallelism related processing logic in LoRA that # requires multi-GPU testing for validation. - pytest -v -s -x lora/test_chatglm3_tp.py diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index 39a5d51c48833..2a86596a6d603 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -9,6 +9,7 @@ steps: source_file_dependencies: - vllm/ - tests/models/test_initialization.py + - tests/models/registry.py commands: # Run a subset of model initialization tests - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset @@ -20,6 +21,7 @@ steps: source_file_dependencies: - vllm/model_executor/models/ - tests/models/test_initialization.py + - tests/models/registry.py commands: # Only when vLLM model source is modified - test initialization of a large # subset of supported models (the complement of the small subset in the above diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml index 703c82eb1a91b..332d5202d8338 100644 --- a/.buildkite/test_areas/pytorch.yaml +++ b/.buildkite/test_areas/pytorch.yaml @@ -13,7 +13,9 @@ steps: # tests covered elsewhere. 
# Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;" + # However, find does not normally propagate error codes, so we combine it with xargs + # (using -0 for proper path handling) + - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Fullgraph Smoke Test timeout_in_minutes: 30 From 84896fda22d3de74398a88b5769c98eef14258f1 Mon Sep 17 00:00:00 2001 From: baoqian426 <1354987947@qq.com> Date: Wed, 17 Dec 2025 19:32:34 +0800 Subject: [PATCH 021/176] [Bugfix] deepseek-V3.2 self.weights_proj has no bias (#30841) Signed-off-by: baoqian <1354987947@qq.com> Signed-off-by: baoqian426 <1354987947@qq.com> --- vllm/model_executor/models/deepseek_v2.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 146124153c79d..6670143cda250 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -835,7 +835,11 @@ class Indexer(nn.Module): ) self.k_norm = LayerNorm(self.head_dim, eps=1e-6) self.weights_proj = ReplicatedLinear( - hidden_size, self.n_head, quant_config=None, prefix=f"{prefix}.weights_proj" + hidden_size, + self.n_head, + bias=False, + quant_config=None, + prefix=f"{prefix}.weights_proj", ) self.softmax_scale = self.head_dim**-0.5 From fb980eb2fdd15f81d4c5695347bdea308bb5515e Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 17 Dec 2025 11:33:50 +0000 Subject: [PATCH 022/176] Fix lazy import (#30858) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/v1/structured_output/utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py 
index cb5ad99cfbdf7..74df0fa067670 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -20,9 +20,9 @@ from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput if TYPE_CHECKING: import outlines_core as oc + import transformers.convert_slow_tokenizer as convert_slow_tokenizer import transformers.file_utils as file_utils import xgrammar as xgr - from transformers.convert_slow_tokenizer import bytes_to_unicode from vllm.tokenizers import TokenizerLike from vllm.v1.worker.gpu_input_batch import InputBatch @@ -30,8 +30,8 @@ else: xgr = LazyLoader("xgr", globals(), "xgrammar") oc = LazyLoader("oc", globals(), "outlines_core") file_utils = LazyLoader("file_utils", globals(), "transformers.file_utils") - bytes_to_unicode = LazyLoader( - "bytes_to_unicode", globals(), "transformers.convert_slow_tokenizer" + convert_slow_tokenizer = LazyLoader( + "convert_slow_tokenizer", globals(), "transformers.convert_slow_tokenizer" ) TokenizerLike = object @@ -202,7 +202,9 @@ def _reduced_vocabulary( A Dict of token string -> equivalent token ids """ - unicode_to_bytes = {v: k for k, v in bytes_to_unicode().items()} + unicode_to_bytes = { + v: k for k, v in convert_slow_tokenizer.bytes_to_unicode().items() + } def convert_token_to_string(token: str) -> str: string = tokenizer.convert_tokens_to_string([token]) From 6482e3895baa483fb30227648aa4721f09699cba Mon Sep 17 00:00:00 2001 From: Hank_ <37239608+ILikeIneine@users.noreply.github.com> Date: Wed, 17 Dec 2025 19:58:16 +0800 Subject: [PATCH 023/176] chores: adjust the attn register param order (#30688) Signed-off-by: Hank --- vllm/attention/backends/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py index eaa0fa1d5db39..ed0021db204ac 100644 --- a/vllm/attention/backends/registry.py +++ b/vllm/attention/backends/registry.py @@ -201,8 +201,8 @@ _MAMBA_ATTN_OVERRIDES: 
dict[MambaAttentionBackendEnum, str] = {} def register_backend( backend: AttentionBackendEnum | MambaAttentionBackendEnum, - is_mamba: bool = False, class_path: str | None = None, + is_mamba: bool = False, ) -> Callable[[type], type]: """Register or override a backend implementation. From 6e9dbcc50e35af75ec76bf033ee6402697c02609 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Wed, 17 Dec 2025 03:58:43 -0800 Subject: [PATCH 024/176] [Fix] uniform decode batch check (#30747) Signed-off-by: Jialin Ouyang --- tests/v1/worker/test_gpu_model_runner.py | 84 ++++++++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 45 ++++++++++--- 2 files changed, 121 insertions(+), 8 deletions(-) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 7b8c4268a5237..59f1ac705829f 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -1110,3 +1110,87 @@ def test_hybrid_cache_integration(model_runner, dist_init): runner._update_states(scheduler_output) assert _is_req_scheduled(runner, req_id) assert _is_req_state_block_table_match(runner, req_id) + + +def test_is_uniform_decode() -> None: + # Normal + assert GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=1, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=16, + ) + assert not GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=2, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=16, + ) + assert not GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=1, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=15, + ) + # Spec decoding + assert GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=5, + uniform_decode_query_len=5, + num_tokens=30, + num_reqs=6, + ) + assert not GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=5, + uniform_decode_query_len=4, + num_tokens=30, + num_reqs=6, + ) + assert not GPUModelRunner._is_uniform_decode( + 
max_num_scheduled_tokens=5, + uniform_decode_query_len=5, + num_tokens=30, + num_reqs=7, + ) + # Force uniform decode + assert GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=1, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=16, + force_uniform_decode=True, + ) + assert GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=2, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=16, + force_uniform_decode=True, + ) + assert GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=1, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=15, + force_uniform_decode=True, + ) + assert not GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=1, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=16, + force_uniform_decode=False, + ) + assert not GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=2, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=16, + force_uniform_decode=False, + ) + assert not GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=1, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=15, + force_uniform_decode=False, + ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1db5bc99fff6c..a44150432434b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2777,6 +2777,27 @@ class GPUModelRunner( **model_kwargs, ) + @staticmethod + def _is_uniform_decode( + max_num_scheduled_tokens: int, + uniform_decode_query_len: int, + num_tokens: int, + num_reqs: int, + force_uniform_decode: bool | None = None, + ) -> bool: + """ + Checks if it's a decode batch with same amount scheduled tokens + across all requests. 
+ """ + return ( + ( + (max_num_scheduled_tokens == uniform_decode_query_len) + and (num_tokens == max_num_scheduled_tokens * num_reqs) + ) + if force_uniform_decode is None + else force_uniform_decode + ) + def _determine_batch_execution_and_padding( self, num_tokens: int, @@ -2798,14 +2819,12 @@ class GPUModelRunner( torch.Tensor | None, CUDAGraphStat | None, ]: - num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens) - uniform_decode = ( - ( - (max_num_scheduled_tokens == self.uniform_decode_query_len) - and (num_tokens_padded == max_num_scheduled_tokens * num_reqs) - ) - if force_uniform_decode is None - else force_uniform_decode + uniform_decode = self._is_uniform_decode( + max_num_scheduled_tokens=max_num_scheduled_tokens, + uniform_decode_query_len=self.uniform_decode_query_len, + num_tokens=num_tokens, + num_reqs=num_reqs, + force_uniform_decode=force_uniform_decode, ) # Encoder-decoder models only support CG for decoder_step > 0 (no enc_output # is present). Also, chunked-prefill is disabled, so batch are uniform. 
@@ -2819,6 +2838,7 @@ class GPUModelRunner( else force_has_lora ) + num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens) dispatch_cudagraph = ( lambda num_tokens, disable_full: self.cudagraph_dispatcher.dispatch( num_tokens=num_tokens, @@ -2834,6 +2854,15 @@ class GPUModelRunner( num_tokens_padded, use_cascade_attn or has_encoder_output ) num_tokens_padded = batch_descriptor.num_tokens + if self.compilation_config.pass_config.enable_sp: + assert ( + batch_descriptor.num_tokens + % self.vllm_config.parallel_config.tensor_parallel_size + == 0 + ), ( + "Sequence parallelism requires num_tokens to be " + "a multiple of tensor parallel size" + ) # Extra coordination when running data-parallel since we need to coordinate # across ranks From 9e67c4ce985b0b8852603cfe3fcaf8f37de137ed Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Wed, 17 Dec 2025 20:14:45 +0800 Subject: [PATCH 025/176] [Docs] fix function name (#30748) Signed-off-by: rongfu.leng --- docs/design/plugin_system.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index b0ca2dad23d5b..0fd448c2153c3 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -109,7 +109,7 @@ Every plugin has three parts: - `init_device`: This function is called to set up the device for the worker. - `initialize_cache`: This function is called to set cache config for the worker. - `load_model`: This function is called to load the model weights to device. - - `get_kv_cache_spaces`: This function is called to generate the kv cache spaces for the model. + - `get_kv_cache_spec`: This function is called to generate the kv cache spec for the model. - `determine_available_memory`: This function is called to profiles the peak memory usage of the model to determine how much memory can be used for KV cache without OOMs. 
- `initialize_from_config`: This function is called to allocate device KV cache with the specified kv_cache_config - `execute_model`: This function is called every step to inference the model. From b7b6a60aca0405b2d6b2ed6fd13853635f000b5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E9=91=AB=E5=B4=A7?= <50285788+SongDI911@users.noreply.github.com> Date: Wed, 17 Dec 2025 23:10:59 +0800 Subject: [PATCH 026/176] Adapt the old parameter enable_thinking in chat_template_kwargs (#30852) Signed-off-by: xinsong.gao <1418762819@qq.com> Co-authored-by: Chauncey --- vllm/reasoning/deepseek_v3_reasoning_parser.py | 2 ++ vllm/tokenizers/deepseek_v32.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py index 6604f70badbcf..4e6758586bf42 100644 --- a/vllm/reasoning/deepseek_v3_reasoning_parser.py +++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py @@ -26,6 +26,8 @@ class DeepSeekV3ReasoningParser(ReasoningParser): chat_kwargs = kwargs.pop("chat_template_kwargs", {}) or {} thinking = bool(chat_kwargs.pop("thinking", False)) + enable_thinking = bool(chat_kwargs.pop("enable_thinking", False)) + thinking = thinking or enable_thinking if thinking: self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs) diff --git a/vllm/tokenizers/deepseek_v32.py b/vllm/tokenizers/deepseek_v32.py index bf279a5cf67c5..d519b61ddb76d 100644 --- a/vllm/tokenizers/deepseek_v32.py +++ b/vllm/tokenizers/deepseek_v32.py @@ -50,6 +50,8 @@ class DeepseekV32Tokenizer(CachedHfTokenizer): **kwargs, ) -> str | list[int]: thinking = kwargs.get("thinking", False) + enable_thinking = kwargs.get("enable_thinking", False) + thinking = thinking or enable_thinking thinking_mode = "thinking" if not thinking: thinking_mode = "chat" From 196cdc3224112df7f68c901fe4c5314875a65be8 Mon Sep 17 00:00:00 2001 From: KimHyemin <102578109+www-spam@users.noreply.github.com> Date: Thu, 18 Dec 2025 00:11:18 +0900 
Subject: [PATCH 027/176] [Model] Gemma3: Support untied word embeddings (#30827) Signed-off-by: www-spam --- vllm/model_executor/models/gemma3.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 40f6d100c767e..70f72b5cb9beb 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -39,7 +39,10 @@ from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name, @@ -532,12 +535,20 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): super().__init__() self.config = config - # currently all existing Gemma models have `tie_word_embeddings` enabled - assert config.tie_word_embeddings self.quant_config = quant_config self.model = Gemma3Model( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") ) + + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) + if config.tie_word_embeddings: + self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) + self.logits_processor = LogitsProcessor( config.vocab_size, soft_cap=config.final_logit_softcapping ) @@ -565,7 +576,7 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self, hidden_states: torch.Tensor, ) -> torch.Tensor | None: - logits = self.logits_processor(self.model.embed_tokens, hidden_states) + logits = 
self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: From 2497228ad4427310bc55427f6db404a00de4fd78 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 17 Dec 2025 23:32:17 +0800 Subject: [PATCH 028/176] [Chore] Factor out logic for requesting initial memory (#30868) Signed-off-by: DarkLight1337 --- vllm/utils/mem_utils.py | 31 +++++++++++++++++++++++++++---- vllm/v1/worker/gpu_worker.py | 20 ++++---------------- vllm/v1/worker/utils.py | 26 +++++++++++++++++++++++++- 3 files changed, 56 insertions(+), 21 deletions(-) diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py index e2517b935bf28..bf6d7846573b9 100644 --- a/vllm/utils/mem_utils.py +++ b/vllm/utils/mem_utils.py @@ -66,27 +66,43 @@ class MemorySnapshot: torch_memory: int = 0 non_torch_memory: int = 0 timestamp: float = 0.0 + + device: torch.types.Device = None auto_measure: bool = True def __post_init__(self) -> None: + if self.device is None: + from vllm.platforms import current_platform + + device_fn = current_platform.current_device + assert device_fn is not None + self.device_ = torch.device(device_fn()) + else: + self.device_ = torch.device(self.device) + if self.auto_measure: self.measure() def measure(self) -> None: from vllm.platforms import current_platform + device = self.device_ + # we measure the torch peak memory usage via allocated_bytes, # rather than `torch.cuda.memory_reserved()` . # After `torch.cuda.reset_peak_memory_stats()`, # `torch.cuda.memory_reserved()` will keep growing, and only shrink # when we call `torch.cuda.empty_cache()` or OOM happens. 
- self.torch_peak = torch.cuda.memory_stats().get("allocated_bytes.all.peak", 0) + self.torch_peak = torch.cuda.memory_stats(device).get( + "allocated_bytes.all.peak", 0 + ) - self.free_memory, self.total_memory = torch.cuda.mem_get_info() + self.free_memory, self.total_memory = torch.cuda.mem_get_info(device) shared_sysmem_device_mem_sms = ((8, 7), (11, 0), (12, 1)) # Orin, Thor, Spark if ( current_platform.is_cuda() - and current_platform.get_device_capability() in shared_sysmem_device_mem_sms + and current_platform.get_device_capability(device.index) + in shared_sysmem_device_mem_sms ): # On UMA (Orin, Thor and Spark) platform, # where both CPU and GPU rely on system memory, @@ -106,12 +122,18 @@ class MemorySnapshot: # torch.cuda.memory_reserved() is how many bytes # PyTorch gets from cuda (by calling cudaMalloc, etc.) # this is used to measure the non-torch memory usage - self.torch_memory = torch.cuda.memory_reserved() + self.torch_memory = torch.cuda.memory_reserved(device) self.non_torch_memory = self.cuda_memory - self.torch_memory self.timestamp = time.time() def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot": + if self.device_ != other.device_: + raise ValueError( + "The two snapshots should be from the same device! " + f"Found: {self.device_} vs. 
{other.device_}" + ) + return MemorySnapshot( torch_peak=self.torch_peak - other.torch_peak, free_memory=self.free_memory - other.free_memory, @@ -120,6 +142,7 @@ class MemorySnapshot: torch_memory=self.torch_memory - other.torch_memory, non_torch_memory=self.non_torch_memory - other.non_torch_memory, timestamp=self.timestamp - other.timestamp, + device=self.device_, auto_measure=False, ) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 1e13650cd083e..bc71351d2cc55 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -56,6 +56,8 @@ from vllm.v1.worker.utils import is_residual_scattered_for_sp from vllm.v1.worker.worker_base import WorkerBase from vllm.v1.worker.workspace import init_workspace_manager +from .utils import request_memory + logger = init_logger(__name__) if TYPE_CHECKING: @@ -237,22 +239,8 @@ class Worker(WorkerBase): torch.cuda.empty_cache() # take current memory snapshot - self.init_snapshot = MemorySnapshot() - self.requested_memory = ( - self.init_snapshot.total_memory - * self.cache_config.gpu_memory_utilization - ) - if self.init_snapshot.free_memory < self.requested_memory: - GiB = lambda b: round(b / GiB_bytes, 2) - raise ValueError( - f"Free memory on device " - f"({GiB(self.init_snapshot.free_memory)}/" - f"{GiB(self.init_snapshot.total_memory)} GiB) on startup " - f"is less than desired GPU memory utilization " - f"({self.cache_config.gpu_memory_utilization}, " - f"{GiB(self.requested_memory)} GiB). Decrease GPU memory " - f"utilization or reduce GPU memory used by other processes." 
- ) + self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device) + self.requested_memory = request_memory(init_snapshot, self.cache_config) else: raise RuntimeError(f"Not support device type: {self.device_config.device}") diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 2e8afec024ce9..31ccf7f157468 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -8,13 +8,15 @@ from typing_extensions import deprecated from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.layer import Attention -from vllm.config import ModelConfig, SchedulerConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig from vllm.logger import init_logger from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.registry import MultiModalRegistry from vllm.platforms import current_platform +from vllm.utils.mem_constants import GiB_bytes +from vllm.utils.mem_utils import MemorySnapshot from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec @@ -248,6 +250,28 @@ def gather_mm_placeholders( return placeholders[is_embed] +def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> float: + """ + Calculate the amount of memory required by vLLM, then validate + that the current amount of free memory is sufficient for that. 
+ """ + requested_memory = init_snapshot.total_memory * cache_config.gpu_memory_utilization + + if init_snapshot.free_memory < requested_memory: + GiB = lambda b: round(b / GiB_bytes, 2) + raise ValueError( + f"Free memory on device {init_snapshot.device_} " + f"({GiB(init_snapshot.free_memory)}/" + f"{GiB(init_snapshot.total_memory)} GiB) on startup " + f"is less than desired GPU memory utilization " + f"({cache_config.gpu_memory_utilization}, " + f"{GiB(requested_memory)} GiB). Decrease GPU memory " + f"utilization or reduce GPU memory used by other processes." + ) + + return requested_memory + + def add_kv_sharing_layers_to_kv_cache_groups( shared_kv_cache_layers: dict[str, str], kv_cache_groups: list[KVCacheGroupSpec], From 9ca8cb38fd68142627c9649756f1ddc5432c8b19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Wed, 17 Dec 2025 18:49:56 +0100 Subject: [PATCH 029/176] [CI][Bugfix] Fix flaky `tests/entrypoints/openai/test_audio.py::test_chat_streaming_audio` (#30878) Signed-off-by: NickLucche --- tests/entrypoints/openai/test_audio.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index a2d8993441fcd..4cf864bdb2de9 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -254,7 +254,9 @@ async def test_single_chat_session_input_audio( async def test_chat_streaming_audio( client: openai.AsyncOpenAI, model_name: str, audio_url: str ): - messages = dummy_messages_from_audio_url(audio_url) + messages = dummy_messages_from_audio_url( + audio_url, "What's a short title for this audio?" 
+ ) # test single completion chat_completion = await client.chat.completions.create( From 7eb6cb6c18a948fb49824154cb3ece1e32d12cf8 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 17 Dec 2025 12:49:59 -0500 Subject: [PATCH 030/176] [Attention] Update tests to remove deprecated env vars (#30563) Signed-off-by: Matthew Bonanni --- .../scripts/hardware_ci/run-xpu-test.sh | 2 +- .../test_basic_correctness.py | 85 +++++------ tests/compile/distributed/test_fusions_e2e.py | 9 +- .../fullgraph/test_basic_correctness.py | 82 ++++++----- .../compile/fullgraph/test_full_cudagraph.py | 13 +- tests/compile/fullgraph/test_full_graph.py | 7 +- tests/distributed/test_context_parallel.py | 4 +- tests/distributed/test_pp_cudagraph.py | 26 ++-- tests/engine/test_arg_utils.py | 135 +++++++++++++++++- tests/entrypoints/openai/test_serving_chat.py | 13 +- .../attention/test_attention_selector.py | 52 +++---- .../attention/test_rocm_attention_selector.py | 60 +++++--- tests/kernels/test_flex_attention.py | 95 ++++++------ .../generation/test_granite_speech.py | 12 +- tests/models/multimodal/pooling/conftest.py | 24 ++-- .../models/multimodal/pooling/test_siglip.py | 8 ++ tests/models/quantization/test_fp8.py | 3 +- tests/models/test_initialization.py | 12 +- .../test_rocm_attention_backends_selection.py | 12 +- tests/v1/attention/utils.py | 47 +++--- tests/v1/cudagraph/test_cudagraph_mode.py | 33 +---- tests/v1/determinism/test_batch_invariance.py | 25 ++-- .../test_online_batch_invariance.py | 5 +- tests/v1/e2e/test_async_scheduling.py | 22 +-- tests/v1/e2e/test_cascade_attention.py | 29 ++-- tests/v1/e2e/test_spec_decode.py | 43 +++--- .../nixl_integration/run_accuracy_test.sh | 22 ++- .../tp_config_sweep_accuracy_test.sh | 12 +- .../kv_connector/unit/test_nixl_connector.py | 6 +- tests/v1/kv_connector/unit/utils.py | 4 + tests/v1/kv_offload/test_cpu_offloading.py | 15 +- tests/v1/spec_decode/test_eagle.py | 19 ++- tests/v1/spec_decode/test_max_len.py | 89 ++++++------ 
vllm/v1/attention/backends/rocm_attn.py | 2 +- 34 files changed, 580 insertions(+), 447 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index dfc9db512d1e9..85b554e5e8646 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -39,7 +39,7 @@ docker run \ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager - VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN cd tests pytest -v -s v1/core pytest -v -s v1/engine diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 9e1cc309edd1d..68b5cd5101d5d 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -67,7 +67,6 @@ def _fix_prompt_embed_outputs( @pytest.mark.parametrize("model_executor", ["uni", "mp"]) @pytest.mark.parametrize("enable_prompt_embeds", [True, False]) def test_models( - monkeypatch: pytest.MonkeyPatch, hf_runner, model: str, backend: str, @@ -77,48 +76,46 @@ def test_models( model_executor: str, enable_prompt_embeds: bool, ) -> None: - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", backend) + # 5042 tokens for gemma2 + # gemma2 has alternating sliding window size of 4096 + # we need a prompt with more than 4096 
tokens to test the sliding window + prompt = ( + "The following numbers of the sequence " + + ", ".join(str(i) for i in range(1024)) + + " are:" + ) + example_prompts = [prompt] - # 5042 tokens for gemma2 - # gemma2 has alternating sliding window size of 4096 - # we need a prompt with more than 4096 tokens to test the sliding window - prompt = ( - "The following numbers of the sequence " - + ", ".join(str(i) for i in range(1024)) - + " are:" - ) - example_prompts = [prompt] + with hf_runner(model) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + if enable_prompt_embeds: + with torch.no_grad(): + prompt_embeds = hf_model.get_prompt_embeddings(example_prompts) - with hf_runner(model) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - if enable_prompt_embeds: - with torch.no_grad(): - prompt_embeds = hf_model.get_prompt_embeddings(example_prompts) + with VllmRunner( + model, + max_model_len=8192, + enforce_eager=enforce_eager, + enable_prompt_embeds=enable_prompt_embeds, + gpu_memory_utilization=0.7, + async_scheduling=async_scheduling, + distributed_executor_backend=model_executor, + attention_config={"backend": backend}, + ) as vllm_model: + if enable_prompt_embeds: + vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens) + vllm_outputs = _fix_prompt_embed_outputs( + vllm_outputs, hf_model, example_prompts + ) + else: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - with VllmRunner( - model, - max_model_len=8192, - enforce_eager=enforce_eager, - enable_prompt_embeds=enable_prompt_embeds, - gpu_memory_utilization=0.7, - async_scheduling=async_scheduling, - distributed_executor_backend=model_executor, - ) as vllm_model: - if enable_prompt_embeds: - vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens) - vllm_outputs = _fix_prompt_embed_outputs( - vllm_outputs, hf_model, example_prompts - ) - else: - vllm_outputs = 
vllm_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @multi_gpu_test(num_gpus=2) @@ -161,12 +158,6 @@ def test_models_distributed( ): # noqa pytest.skip("enable_prompt_embeds does not work with ray compiled dag.") - if attention_backend: - monkeypatch_context.setenv( - "VLLM_ATTENTION_BACKEND", - attention_backend, - ) - for k, v in extra_env.items(): monkeypatch_context.setenv(k, v) @@ -178,6 +169,7 @@ def test_models_distributed( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method # (the default method). + attention_config = {"backend": attention_backend} if attention_backend else None with vllm_runner( model, dtype=dtype, @@ -185,6 +177,7 @@ def test_models_distributed( distributed_executor_backend=distributed_executor_backend, enable_prompt_embeds=enable_prompt_embeds, gpu_memory_utilization=0.7, + attention_config=attention_config, ) as vllm_model: if enable_prompt_embeds: with hf_runner(model, dtype=dtype) as hf_model: diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 960b5b4bd7ad4..28ab2cee71a6a 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -208,7 +208,8 @@ def test_attn_quant( # To capture subprocess logs, we need to know whether spawn or fork is used. # Force spawn as it is more general. 
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name) + + model_kwargs["attention_config"] = {"backend": backend.name} compilation_config = CompilationConfig( # Testing properties @@ -297,7 +298,8 @@ def test_tp2_attn_quant_allreduce_rmsnorm( # To capture subprocess logs, we need to know whether spawn or fork is used. # Force spawn as it is more general. monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name) + + model_kwargs["attention_config"] = {"backend": backend.name} compilation_config = CompilationConfig( # Testing properties @@ -409,7 +411,8 @@ def test_tp2_attn_quant_async_tp( # To capture subprocess logs, we need to know whether spawn or fork is used. # Force spawn as it is more general. monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name) + + model_kwargs["attention_config"] = {"backend": backend.name} compilation_config = CompilationConfig( # Testing properties diff --git a/tests/compile/fullgraph/test_basic_correctness.py b/tests/compile/fullgraph/test_basic_correctness.py index f2e58b5cc423e..d062ed221ff59 100644 --- a/tests/compile/fullgraph/test_basic_correctness.py +++ b/tests/compile/fullgraph/test_basic_correctness.py @@ -89,7 +89,6 @@ class TestSetting: ], ) def test_compile_correctness( - monkeypatch: pytest.MonkeyPatch, test_setting: TestSetting, ): # this test is run under multiple suits, with different GPUs. 
@@ -107,49 +106,48 @@ def test_compile_correctness( f"{cuda_device_count_stateless()}" ) - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) - final_args = [ - *model_args, - "-pp", - str(pp_size), - "-tp", - str(tp_size), - "-cc.cudagraph_mode=none", - ] + final_args = [ + *model_args, + "-pp", + str(pp_size), + "-tp", + str(tp_size), + "-cc.cudagraph_mode=none", + f"--attention-backend={attn_backend}", + ] - all_args: list[list[str]] = [] - all_envs: list[dict[str, str] | None] = [] + all_args: list[list[str]] = [] + all_envs: list[dict[str, str] | None] = [] - for comp_mode in [ - CompilationMode.STOCK_TORCH_COMPILE, - CompilationMode.DYNAMO_TRACE_ONCE, - CompilationMode.VLLM_COMPILE, - ]: - for mode in [CompilationMode.NONE, comp_mode]: - all_args.append( - final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"] - ) - - # inductor will change the output, so we only compare if the output - # is close, not exactly the same. - compare_all_settings( - model, - all_args, - all_envs, - method=method if method != "generate" else "generate_close", + for comp_mode in [ + CompilationMode.STOCK_TORCH_COMPILE, + CompilationMode.DYNAMO_TRACE_ONCE, + CompilationMode.VLLM_COMPILE, + ]: + for mode in [CompilationMode.NONE, comp_mode]: + all_args.append( + final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"] ) - all_envs.clear() - all_args.clear() - for mode in [ - CompilationMode.NONE, - CompilationMode.STOCK_TORCH_COMPILE, - CompilationMode.DYNAMO_TRACE_ONCE, - CompilationMode.VLLM_COMPILE, - ]: - all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"]) - all_envs.append({}) - all_envs.append({}) + # inductor will change the output, so we only compare if the output + # is close, not exactly the same. 
+ compare_all_settings( + model, + all_args, + all_envs, + method=method if method != "generate" else "generate_close", + ) + all_envs.clear() + all_args.clear() - compare_all_settings(model, all_args * 3, all_envs, method=method) + for mode in [ + CompilationMode.NONE, + CompilationMode.STOCK_TORCH_COMPILE, + CompilationMode.DYNAMO_TRACE_ONCE, + CompilationMode.VLLM_COMPILE, + ]: + all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"]) + all_envs.append({}) + all_envs.append({}) + + compare_all_settings(model, all_args * 3, all_envs, method=method) diff --git a/tests/compile/fullgraph/test_full_cudagraph.py b/tests/compile/fullgraph/test_full_cudagraph.py index c6d4b5272dbcf..4ce6abfe3e46d 100644 --- a/tests/compile/fullgraph/test_full_cudagraph.py +++ b/tests/compile/fullgraph/test_full_cudagraph.py @@ -74,7 +74,6 @@ def llm_pair(request): # Force native sampler to avoid potential nondeterminism in FlashInfer # when per-request generators are not used in V1. "VLLM_USE_FLASHINFER_SAMPLER": "0", - **backend_config.env_vars, } with temporary_environ(env_vars): full = LLM( @@ -170,16 +169,10 @@ class TestFullCUDAGraph: @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda") def test_full_cudagraph_with_invalid_backend(): - with ( - temporary_environ( - { - "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION", - # Flex_Attention is not supported with full cuda graph - } - ), - pytest.raises(RuntimeError), - ): + # Flex_Attention is not supported with full cuda graph + with pytest.raises(RuntimeError): LLM( model="Qwen/Qwen2-1.5B-Instruct", compilation_config=CompilationConfig(cudagraph_mode="FULL"), + attention_config={"backend": "FLEX_ATTENTION"}, ) diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index 3cd1d4be2ebdc..22af2d57f4f3d 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -197,20 +197,19 @@ def 
test_custom_compile_config( ], ) def test_fp8_kv_scale_compile( - monkeypatch: pytest.MonkeyPatch, compilation_mode: int, model: str, backend: AttentionBackendEnum | None, ): - if backend: - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name) - model_kwargs = { "quantization": "fp8", "kv_cache_dtype": "fp8_e4m3", "calculate_kv_scales": True, "max_model_len": 512, } + if backend: + model_kwargs["attention_config"] = {"backend": backend.name} + run_model(compilation_mode, model, **model_kwargs) diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index aa47f28a34dd5..a286309217719 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -219,14 +219,12 @@ def _test_cp_gsm8k( ] ) - server_env = {} if attn_backend: - server_env["VLLM_ATTENTION_BACKEND"] = attn_backend + server_args.append(f"--attention-backend={attn_backend}") with RemoteOpenAIServer( model_id, server_args, - env_dict=server_env, max_wait_seconds=720, ) as remote_server: host = f"http://{remote_server.host}" diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py index 2f2b43cb4cc2b..34ae305c2d2c1 100644 --- a/tests/distributed/test_pp_cudagraph.py +++ b/tests/distributed/test_pp_cudagraph.py @@ -20,23 +20,21 @@ from ..utils import compare_two_settings, create_new_process_for_each_test ) @create_new_process_for_each_test() def test_pp_cudagraph( - monkeypatch: pytest.MonkeyPatch, PP_SIZE: int, MODEL_NAME: str, ATTN_BACKEND: LiteralString, ): - with monkeypatch.context() as m: - cudagraph_args = [ - # use half precision for speed and memory savings in CI environment - "--dtype", - "float16", - "--pipeline-parallel-size", - str(PP_SIZE), - "--distributed-executor-backend", - "mp", - ] - m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND) + cudagraph_args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + 
"--pipeline-parallel-size", + str(PP_SIZE), + "--distributed-executor-backend", + "mp", + f"--attention-backend={ATTN_BACKEND}", + ] - eager_args = cudagraph_args + ["--enforce-eager"] + eager_args = cudagraph_args + ["--enforce-eager"] - compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) + compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index c2cf77ffa12b6..25a5e00cc0e16 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -9,7 +9,7 @@ from typing import Annotated, Literal import pytest -from vllm.config import CompilationConfig, config +from vllm.config import AttentionConfig, CompilationConfig, config from vllm.engine.arg_utils import ( EngineArgs, contains_type, @@ -298,6 +298,139 @@ def test_compilation_config(): ) +def test_attention_config(): + from vllm.attention.backends.registry import AttentionBackendEnum + + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + + # default value + args = parser.parse_args([]) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + assert engine_args.attention_config == AttentionConfig() + + # set backend via dot notation + args = parser.parse_args(["--attention-config.backend", "FLASH_ATTN"]) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + assert engine_args.attention_config.backend is not None + assert engine_args.attention_config.backend.name == "FLASH_ATTN" + + # set backend via --attention-backend shorthand + args = parser.parse_args(["--attention-backend", "FLASHINFER"]) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + assert engine_args.attention_backend is not None + assert engine_args.attention_backend == "FLASHINFER" + + # set all fields via dot notation + args = parser.parse_args( + [ + "--attention-config.backend", + "FLASH_ATTN", + "--attention-config.flash_attn_version", + "3", + 
"--attention-config.use_prefill_decode_attention", + "true", + "--attention-config.flash_attn_max_num_splits_for_cuda_graph", + "16", + "--attention-config.use_cudnn_prefill", + "true", + "--attention-config.use_trtllm_ragged_deepseek_prefill", + "true", + "--attention-config.use_trtllm_attention", + "true", + "--attention-config.disable_flashinfer_prefill", + "true", + "--attention-config.disable_flashinfer_q_quantization", + "true", + ] + ) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + assert engine_args.attention_config.backend is not None + assert engine_args.attention_config.backend.name == "FLASH_ATTN" + assert engine_args.attention_config.flash_attn_version == 3 + assert engine_args.attention_config.use_prefill_decode_attention is True + assert engine_args.attention_config.flash_attn_max_num_splits_for_cuda_graph == 16 + assert engine_args.attention_config.use_cudnn_prefill is True + assert engine_args.attention_config.use_trtllm_ragged_deepseek_prefill is True + assert engine_args.attention_config.use_trtllm_attention is True + assert engine_args.attention_config.disable_flashinfer_prefill is True + assert engine_args.attention_config.disable_flashinfer_q_quantization is True + + # set to string form of a dict with all fields + args = parser.parse_args( + [ + "--attention-config=" + '{"backend": "FLASHINFER", "flash_attn_version": 2, ' + '"use_prefill_decode_attention": false, ' + '"flash_attn_max_num_splits_for_cuda_graph": 8, ' + '"use_cudnn_prefill": false, ' + '"use_trtllm_ragged_deepseek_prefill": false, ' + '"use_trtllm_attention": false, ' + '"disable_flashinfer_prefill": false, ' + '"disable_flashinfer_q_quantization": false}', + ] + ) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + assert engine_args.attention_config.backend is not None + assert engine_args.attention_config.backend.name == "FLASHINFER" + assert engine_args.attention_config.flash_attn_version == 2 + assert 
engine_args.attention_config.use_prefill_decode_attention is False + assert engine_args.attention_config.flash_attn_max_num_splits_for_cuda_graph == 8 + assert engine_args.attention_config.use_cudnn_prefill is False + assert engine_args.attention_config.use_trtllm_ragged_deepseek_prefill is False + assert engine_args.attention_config.use_trtllm_attention is False + assert engine_args.attention_config.disable_flashinfer_prefill is False + assert engine_args.attention_config.disable_flashinfer_q_quantization is False + + # test --attention-backend flows into VllmConfig.attention_config + args = parser.parse_args( + [ + "--model", + "facebook/opt-125m", + "--attention-backend", + "FLASH_ATTN", + ] + ) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + vllm_config = engine_args.create_engine_config() + assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASH_ATTN + + # test --attention-config.backend flows into VllmConfig.attention_config + args = parser.parse_args( + [ + "--model", + "facebook/opt-125m", + "--attention-config.backend", + "FLASHINFER", + ] + ) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + vllm_config = engine_args.create_engine_config() + assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASHINFER + + # test --attention-backend and --attention-config.backend are mutually exclusive + args = parser.parse_args( + [ + "--model", + "facebook/opt-125m", + "--attention-backend", + "FLASH_ATTN", + "--attention-config.backend", + "FLASHINFER", + ] + ) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + with pytest.raises(ValueError, match="mutually exclusive"): + engine_args.create_engine_config() + + def test_prefix_cache_default(): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) args = parser.parse_args([]) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 
444275e061c61..2befa40d636da 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -76,15 +76,10 @@ def default_server_args(with_tool_parser: bool): @pytest.fixture(scope="module") -def gptoss_server( - monkeypatch_module: pytest.MonkeyPatch, default_server_args: list[str] -): - with monkeypatch_module.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN") - with RemoteOpenAIServer( - GPT_OSS_MODEL_NAME, default_server_args - ) as remote_server: - yield remote_server +def gptoss_server(default_server_args: list[str]): + server_args = default_server_args + ["--attention-backend=TRITON_ATTN"] + with RemoteOpenAIServer(GPT_OSS_MODEL_NAME, server_args) as remote_server: + yield remote_server @pytest_asyncio.fixture diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index c959b2f4bb03c..d62acc2022d10 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -6,7 +6,9 @@ from unittest.mock import patch import pytest import torch +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend +from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config from vllm.platforms import current_platform from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform @@ -73,18 +75,18 @@ def generate_params(): @pytest.mark.parametrize("device, name, use_mla, block_size", generate_params()) -def test_env( +def test_backend_selection( device: str, name: str, use_mla: bool, block_size: int, - monkeypatch: pytest.MonkeyPatch, ): """Test attention backend selection with valid device-backend pairs.""" - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", name) - m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0") + # Create AttentionConfig with 
the specified backend + attention_config = AttentionConfig(backend=AttentionBackendEnum[name]) + vllm_config = VllmConfig(attention_config=attention_config) + with set_current_vllm_config(vllm_config): if device == "cpu": with patch("vllm.platforms.current_platform", CpuPlatform()): backend = get_attn_backend(16, torch.float16, None, block_size) @@ -217,27 +219,32 @@ def test_env( @pytest.mark.parametrize("device", ["cpu", "cuda"]) def test_fp32_fallback(device: str): """Test attention backend selection with fp32.""" - if device == "cpu": - with patch("vllm.platforms.current_platform", CpuPlatform()): - backend = get_attn_backend(16, torch.float32, None, 16) - assert backend.get_name() == "CPU_ATTN" + # Use default config (no backend specified) + vllm_config = VllmConfig() - elif device == "cuda": - with patch("vllm.platforms.current_platform", CudaPlatform()): - backend = get_attn_backend(16, torch.float32, None, 16) - assert backend.get_name() == "FLEX_ATTENTION" + with set_current_vllm_config(vllm_config): + if device == "cpu": + with patch("vllm.platforms.current_platform", CpuPlatform()): + backend = get_attn_backend(16, torch.float32, None, 16) + assert backend.get_name() == "CPU_ATTN" + + elif device == "cuda": + with patch("vllm.platforms.current_platform", CudaPlatform()): + backend = get_attn_backend(16, torch.float32, None, 16) + assert backend.get_name() == "FLEX_ATTENTION" def test_flash_attn(monkeypatch: pytest.MonkeyPatch): """Test FlashAttn validation.""" pytest.skip( "Skipping as current backend selector does not " - "handle fallbacks when a backend is set via env var." + "handle fallbacks when a backend is explicitly set." 
) - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN") + attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASH_ATTN) + vllm_config = VllmConfig(attention_config=attention_config) + with set_current_vllm_config(vllm_config): # Unsupported CUDA arch monkeypatch.setattr(torch.cuda, "get_device_capability", lambda _=None: (7, 5)) backend = get_attn_backend(16, torch.float16, None, 16) @@ -277,15 +284,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch): assert backend.get_name() != "FLASH_ATTN" -def test_invalid_env(monkeypatch: pytest.MonkeyPatch): +def test_invalid_backend(): """Test that invalid attention backend names raise ValueError.""" with ( - monkeypatch.context() as m, - patch("vllm.platforms.current_platform", CudaPlatform()), + pytest.raises(ValueError), ): - m.setenv("VLLM_ATTENTION_BACKEND", "INVALID") - - # Should raise ValueError for invalid backend - with pytest.raises(ValueError) as exc_info: - get_attn_backend(32, torch.float16, None, 16) - assert "Invalid value 'INVALID'" in str(exc_info.value) + # Invalid backend name should raise ValueError when creating enum + AttentionConfig(backend=AttentionBackendEnum["INVALID"]) diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py index b61058081c0b2..f97d475eb47d7 100644 --- a/tests/kernels/attention/test_rocm_attention_selector.py +++ b/tests/kernels/attention/test_rocm_attention_selector.py @@ -4,7 +4,9 @@ import pytest import torch +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend +from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config from vllm.platforms.rocm import RocmPlatform @@ -16,40 +18,56 @@ def clear_cache(): @pytest.mark.skip(reason="Skipped for now. 
Should be revisited.") def test_selector(monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_ATTN") + # Set the current platform to ROCm using monkeypatch + monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform()) - # Set the current platform to ROCm using monkeypatch - monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform()) + # Test standard ROCm attention + attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_ATTN) + vllm_config = VllmConfig(attention_config=attention_config) - # Test standard ROCm attention + with set_current_vllm_config(vllm_config): backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) assert backend.get_name() == "ROCM_FLASH" or backend.get_name() == "TRITON_ATTN" - # MLA test for deepseek related + # MLA test for deepseek related + # Change the attention backend to triton MLA + attention_config = AttentionConfig(backend=AttentionBackendEnum.TRITON_MLA) + vllm_config = VllmConfig(attention_config=attention_config) - # change the attention backend to triton MLA - m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_MLA") + with set_current_vllm_config(vllm_config): backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True) assert backend.get_name() == "TRITON_MLA" - # If attention backend is None - # If use_mla is true - # The selected backend is triton MLA - m.setenv("VLLM_ATTENTION_BACKEND", "") + # If attention backend is None + # If use_mla is true + # The selected backend is triton MLA + attention_config = AttentionConfig(backend=None) + vllm_config = VllmConfig(attention_config=attention_config) + + with set_current_vllm_config(vllm_config): backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True) assert backend.get_name() == "TRITON_MLA" - # change the attention backend to AITER MLA - m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_MLA") + # Change the attention 
backend to AITER MLA + attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_AITER_MLA) + vllm_config = VllmConfig(attention_config=attention_config) + + with set_current_vllm_config(vllm_config): backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True) assert backend.get_name() == "ROCM_AITER_MLA" - # If attention backend is None - # If use_mla is true - # If VLLM_ROCM_USE_AITER is enabled - # The selected backend is ROCM_AITER_MLA - m.setenv("VLLM_ATTENTION_BACKEND", "") + # If attention backend is None + # If use_mla is true + # If VLLM_ROCM_USE_AITER is enabled + # The selected backend is ROCM_AITER_MLA + with monkeypatch.context() as m: m.setenv("VLLM_ROCM_USE_AITER", "1") - backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True) - assert backend.get_name() == "ROCM_AITER_MLA" + + attention_config = AttentionConfig(backend=None) + vllm_config = VllmConfig(attention_config=attention_config) + + with set_current_vllm_config(vllm_config): + backend = get_attn_backend( + 576, torch.bfloat16, "auto", 1, False, use_mla=True + ) + assert backend.get_name() == "ROCM_AITER_MLA" diff --git a/tests/kernels/test_flex_attention.py b/tests/kernels/test_flex_attention.py index ae33f422d3732..f6987d54399d2 100644 --- a/tests/kernels/test_flex_attention.py +++ b/tests/kernels/test_flex_attention.py @@ -37,7 +37,7 @@ def set_seed(seed): not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION, reason="CUDA not available or PyTorch version < 2.7", ) -def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): +def test_flex_attention_vs_default_backend(vllm_runner): """Test that FlexAttention produces the same outputs as the default backend. 
This test compares the outputs from the FlexAttention backend with @@ -54,35 +54,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ] # Run with flex attention - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") - - set_seed(seed) - with vllm_runner( - model_name, - runner="generate", - tensor_parallel_size=1, - num_gpu_blocks_override=128, - enforce_eager=True, - ) as llm_flex: - output_flex = llm_flex.generate_greedy_logprobs( - prompts, max_tokens, num_logprobs - ) + set_seed(seed) + with vllm_runner( + model_name, + runner="generate", + tensor_parallel_size=1, + num_gpu_blocks_override=128, + enforce_eager=True, + attention_config={"backend": "FLEX_ATTENTION"}, + ) as llm_flex: + output_flex = llm_flex.generate_greedy_logprobs( + prompts, max_tokens, num_logprobs + ) # Run with default backend - with monkeypatch.context() as m: - set_seed(seed) - with vllm_runner( - model_name, - runner="generate", - tensor_parallel_size=1, - num_gpu_blocks_override=128, - enforce_eager=True, - gpu_memory_utilization=0.85, - ) as llm_default: - output_default = llm_default.generate_greedy_logprobs( - prompts, max_tokens, num_logprobs - ) + set_seed(seed) + with vllm_runner( + model_name, + runner="generate", + tensor_parallel_size=1, + num_gpu_blocks_override=128, + enforce_eager=True, + gpu_memory_utilization=0.85, + ) as llm_default: + output_default = llm_default.generate_greedy_logprobs( + prompts, max_tokens, num_logprobs + ) check_logprobs_close( outputs_0_lst=output_flex, @@ -96,7 +93,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION, reason="CUDA not available or PyTorch version < 2.7", ) -def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): +def test_encoder_flex_attention_vs_default_backend(vllm_runner): """Test that FlexAttention produces the same outputs as the default backend. 
This test compares the outputs from the FlexAttention backend with @@ -110,30 +107,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ] # Run with flex attention - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") - with vllm_runner( - model_name, - runner="pooling", - dtype=torch.bfloat16, - tensor_parallel_size=1, - max_model_len=100, - enforce_eager=True, - ) as llm_flex: - flex_outputs = llm_flex.embed(prompts) + with vllm_runner( + model_name, + runner="pooling", + dtype=torch.bfloat16, + tensor_parallel_size=1, + max_model_len=100, + enforce_eager=True, + attention_config={"backend": "FLEX_ATTENTION"}, + ) as llm_flex: + flex_outputs = llm_flex.embed(prompts) # Run with default backend - with ( - monkeypatch.context() as m, - vllm_runner( - model_name, - runner="pooling", - dtype=torch.bfloat16, - tensor_parallel_size=1, - max_model_len=100, - enforce_eager=True, - ) as llm_default, - ): + with vllm_runner( + model_name, + runner="pooling", + dtype=torch.bfloat16, + tensor_parallel_size=1, + max_model_len=100, + enforce_eager=True, + ) as llm_default: default_outputs = llm_default.embed(prompts) check_embeddings_close( diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index f528a993f8551..489743c5a29b3 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -35,10 +35,12 @@ audio_lora_path = MODEL_NAME models = [MODEL_NAME] -@pytest.fixture(autouse=True) -def set_attention_backend_for_rocm(monkeypatch): +@pytest.fixture +def granite_speech_attention_config(): + """Return attention config for Granite Speech tests on ROCm.""" if current_platform.is_rocm(): - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN") + return {"backend": "TRITON_ATTN"} + return None def run_test( @@ -53,6 +55,7 @@ def run_test( num_logprobs: int, 
tensor_parallel_size: int, distributed_executor_backend: str | None = None, + attention_config: dict | None = None, ): """Inference result should be the same between hf and vllm. @@ -80,6 +83,7 @@ def run_test( enable_lora=True, max_lora_rank=64, enforce_eager=True, + attention_config=attention_config, ) as vllm_model: lora_request = LoRARequest("audio", 1, audio_lora_path) vllm_outputs_per_case = [ @@ -131,6 +135,7 @@ def test_models( vllm_runner, model: str, audio_assets: AudioTestAssets, + granite_speech_attention_config, dtype: str, max_model_len: int, max_tokens: int, @@ -157,4 +162,5 @@ def test_models( max_tokens=max_tokens, num_logprobs=num_logprobs, tensor_parallel_size=1, + attention_config=granite_speech_attention_config, ) diff --git a/tests/models/multimodal/pooling/conftest.py b/tests/models/multimodal/pooling/conftest.py index c5f40cb42ca2a..401bc39b4b109 100644 --- a/tests/models/multimodal/pooling/conftest.py +++ b/tests/models/multimodal/pooling/conftest.py @@ -2,23 +2,17 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Pytest configuration for vLLM pooling tests.""" -import os -import warnings +import pytest from vllm.platforms import current_platform -def pytest_collection_modifyitems(config, items): - """Set FLEX_ATTENTION backend for SigLIP tests on ROCm.""" - if not current_platform.is_rocm(): - return +@pytest.fixture +def siglip_attention_config(): + """Return attention config for SigLIP tests on ROCm. - siglip_tests = [item for item in items if "test_siglip" in item.nodeid] - - if siglip_tests: - os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION" - warnings.warn( - "ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests", - UserWarning, - stacklevel=1, - ) + On ROCm, SigLIP tests require FLEX_ATTENTION backend. 
+ """ + if current_platform.is_rocm(): + return {"backend": "FLEX_ATTENTION"} + return None diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py index 72886cbf7f323..0b8cd33ccfb9d 100644 --- a/tests/models/multimodal/pooling/test_siglip.py +++ b/tests/models/multimodal/pooling/test_siglip.py @@ -38,6 +38,7 @@ def _run_test( *, dtype: str, tokenization_kwargs: dict[str, Any] | None = None, + attention_config: dict[str, Any] | None = None, ) -> None: if tokenization_kwargs is None: tokenization_kwargs = {} @@ -49,6 +50,7 @@ def _run_test( enforce_eager=True, max_model_len=64, gpu_memory_utilization=0.7, + attention_config=attention_config, ) as vllm_model: vllm_outputs = vllm_model.embed( input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs @@ -90,6 +92,7 @@ def test_models_text( hf_runner, vllm_runner, image_assets, + siglip_attention_config, model: str, dtype: str, ) -> None: @@ -108,6 +111,7 @@ def test_models_text( "padding": "max_length", "max_length": 64, }, # siglip2 was trained with this padding setting. 
+ attention_config=siglip_attention_config, ) @@ -117,6 +121,7 @@ def test_models_image( hf_runner, vllm_runner, image_assets, + siglip_attention_config, model: str, dtype: str, ) -> None: @@ -133,6 +138,7 @@ def test_models_image( input_images, model, dtype=dtype, + attention_config=siglip_attention_config, ) @@ -141,6 +147,7 @@ def test_models_image( def test_models_text_image_no_crash( vllm_runner, image_assets, + siglip_attention_config, model: str, dtype: str, ) -> None: @@ -154,6 +161,7 @@ def test_models_text_image_no_crash( enforce_eager=True, max_model_len=64, gpu_memory_utilization=0.7, + attention_config=siglip_attention_config, ) as vllm_model: with pytest.raises(ValueError, match="not both"): vllm_model.embed(texts, images=images) diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py index 7dfedaf2799d4..f3b85ba0ee394 100644 --- a/tests/models/quantization/test_fp8.py +++ b/tests/models/quantization/test_fp8.py @@ -75,7 +75,6 @@ def test_models( with monkeypatch.context() as m: m.setenv("TOKENIZERS_PARALLELISM", "true") - m.setenv("VLLM_ATTENTION_BACKEND", backend) MAX_MODEL_LEN = 1024 NUM_LOG_PROBS = 8 @@ -86,6 +85,7 @@ def test_models( tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, kv_cache_dtype="auto", + attention_config={"backend": backend}, ) as vllm_model: baseline_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS @@ -97,6 +97,7 @@ def test_models( tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, + attention_config={"backend": backend}, ) as vllm_model: test_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 8c4bd6eaa2dd8..0a573847bf913 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -108,11 +108,12 @@ def 
can_initialize( patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1), monkeypatch.context() as m, ): - if model_arch == "GptOssForCausalLM": - # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU - # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when - # L4 supports FA3. - m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN") + # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU + # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when + # L4 supports FA3. + attention_config = ( + {"backend": "TRITON_ATTN"} if model_arch == "GptOssForCausalLM" else None + ) if model_arch == "WhisperForConditionalGeneration": m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") @@ -143,6 +144,7 @@ def can_initialize( else "vllm", hf_overrides=hf_overrides_fn, max_num_seqs=model_info.max_num_seqs, + attention_config=attention_config, ) diff --git a/tests/v1/attention/test_rocm_attention_backends_selection.py b/tests/v1/attention/test_rocm_attention_backends_selection.py index 77790be6f892b..d8c747056faf6 100644 --- a/tests/v1/attention/test_rocm_attention_backends_selection.py +++ b/tests/v1/attention/test_rocm_attention_backends_selection.py @@ -94,26 +94,20 @@ def mock_on_gfx9(): None, AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path(), ), - # Test Case 9: VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 - ( - {"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"}, - None, - AttentionBackendEnum.ROCM_ATTN.get_path(), - ), - # Test Case 10: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN + # Test Case 9: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN ( {"VLLM_ROCM_USE_AITER": "1"}, "TRITON_ATTN", AttentionBackendEnum.TRITON_ATTN.get_path(), ), - # Test Case 11: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0 + # Test Case 10: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0 # (explicitly disabled) ( {"VLLM_ROCM_USE_AITER": "1", "VLLM_ROCM_USE_AITER_MHA": "0"}, None, AttentionBackendEnum.TRITON_ATTN.get_path(), ), - # Test Case 12: 
VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN + # Test Case 11: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN ( {"VLLM_ROCM_USE_AITER": "1"}, "ROCM_ATTN", diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 4dcaf9d908690..031436a030908 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -249,8 +249,8 @@ def create_dummy_kv_cache( @dataclass class BackendConfig: name: str - env_vars: dict - comp_config: dict # compilation config + attention_config: dict + comp_config: dict specific_gpu_arch: tuple | None = None @@ -259,10 +259,10 @@ full_cg_backend_configs = { # FA3 on Hopper "FA3": BackendConfig( name="FA3", - env_vars={ - "VLLM_ATTENTION_BACKEND": "FLASH_ATTN", - "VLLM_FLASH_ATTN_VERSION": "3", - "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", + attention_config={ + "backend": "FLASH_ATTN", + "flash_attn_version": 3, + "flash_attn_max_num_splits_for_cuda_graph": 16, }, comp_config={ "cudagraph_mode": "FULL", @@ -272,9 +272,7 @@ full_cg_backend_configs = { # FlashMLA on Hopper "FlashMLA": BackendConfig( name="FlashMLA", - env_vars={ - "VLLM_ATTENTION_BACKEND": "FLASHMLA", - }, + attention_config={"backend": "FLASHMLA"}, comp_config={ "cudagraph_mode": "FULL_AND_PIECEWISE", }, @@ -283,9 +281,7 @@ full_cg_backend_configs = { # Cutlass MLA on Blackwell "CutlassMLA": BackendConfig( name="CutlassMLA", - env_vars={ - "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA", - }, + attention_config={"backend": "CUTLASS_MLA"}, comp_config={ "cudagraph_mode": "FULL_AND_PIECEWISE", }, @@ -294,9 +290,7 @@ full_cg_backend_configs = { # FlashInfer MLA on Blackwell "FlashInferMLA": BackendConfig( name="FlashInferMLA", - env_vars={ - "VLLM_ATTENTION_BACKEND": "FLASHINFER_MLA", - }, + attention_config={"backend": "FLASHINFER_MLA"}, comp_config={ "cudagraph_mode": "FULL_AND_PIECEWISE", }, @@ -305,9 +299,9 @@ full_cg_backend_configs = { # FlashAttention MLA on Hopper "FlashAttentionMLA": BackendConfig( name="FlashAttentionMLA", - env_vars={ - 
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA", - "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", + attention_config={ + "backend": "FLASH_ATTN_MLA", + "flash_attn_max_num_splits_for_cuda_graph": 16, }, comp_config={ "cudagraph_mode": "FULL_DECODE_ONLY", @@ -317,10 +311,10 @@ full_cg_backend_configs = { # FA2 "FA2": BackendConfig( name="FA2", - env_vars={ - "VLLM_ATTENTION_BACKEND": "FLASH_ATTN", - "VLLM_FLASH_ATTN_VERSION": "2", - "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", + attention_config={ + "backend": "FLASH_ATTN", + "flash_attn_version": 2, + "flash_attn_max_num_splits_for_cuda_graph": 16, }, comp_config={ "cudagraph_mode": "FULL_AND_PIECEWISE", @@ -329,7 +323,7 @@ full_cg_backend_configs = { # Triton Attention "TritonAttn": BackendConfig( name="TritonAttn", - env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"}, + attention_config={"backend": "TRITON_ATTN"}, comp_config={ "cudagraph_mode": "FULL_AND_PIECEWISE", }, @@ -337,14 +331,17 @@ full_cg_backend_configs = { # FlashInfer "FlashInfer": BackendConfig( name="FlashInfer", - env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"}, + attention_config={"backend": "FLASHINFER"}, comp_config={ "cudagraph_mode": "FULL_AND_PIECEWISE", }, ), "RocmAttn": BackendConfig( name="RocmAttn", - env_vars={"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"}, + attention_config={ + "backend": "ROCM_ATTN", + "use_prefill_decode_attention": True, + }, comp_config={ "cudagraph_mode": "FULL", }, diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py index b1895e83b8b37..f4f74d16c7019 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import contextlib -import os import weakref from contextlib import ExitStack @@ -13,26 +11,6 @@ from vllm import LLM from vllm.config import CompilationConfig, CompilationMode from 
vllm.platforms import current_platform - -@contextlib.contextmanager -def temporary_environ(env_vars): - """ - Temporarily set environment variables and restore them afterward. - We have to do this vs monkeypatch because monkeypatch doesn't work - with "module" scoped fixtures. - """ - original_env = {k: os.environ.get(k) for k in env_vars} - try: - os.environ.update(env_vars) - yield - finally: - for k, v in original_env.items(): - if v is None: - os.environ.pop(k, None) - else: - os.environ[k] = v - - # test attention backend and cudagraph_mode combo # (backend_name, cudagraph_mode, supported) if current_platform.is_rocm(): @@ -68,9 +46,9 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte ): pytest.skip("Only Hopper GPUs support FA3 and FlashMLA") - env_vars = backend_configs[backend_name].env_vars + attention_config = backend_config.attention_config - with temporary_environ(env_vars), ExitStack() as stack: + with ExitStack() as stack: if not supported: stack.enter_context(pytest.raises(Exception)) @@ -80,6 +58,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte trust_remote_code=True, gpu_memory_utilization=0.45, max_model_len=1024, + attention_config=attention_config, compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode ), @@ -122,9 +101,10 @@ combo_cases_2 = [ def test_cudagraph_compilation_combo( backend_name, cudagraph_mode, compilation_mode, supported ): - env_vars = backend_configs[backend_name].env_vars + backend_config = backend_configs[backend_name] + attention_config = backend_config.attention_config - with temporary_environ(env_vars), ExitStack() as stack: + with ExitStack() as stack: if not supported: stack.enter_context(pytest.raises(Exception)) @@ -134,6 +114,7 @@ def test_cudagraph_compilation_combo( trust_remote_code=True, gpu_memory_utilization=0.45, max_model_len=1024, + attention_config=attention_config, 
compilation_config=CompilationConfig( mode=compilation_mode, cudagraph_mode=cudagraph_mode ), diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py index 7a58e1c9bad03..61fb5f07303b4 100644 --- a/tests/v1/determinism/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -28,7 +28,7 @@ IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90() BACKENDS, ) def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( - backend, monkeypatch: pytest.MonkeyPatch + backend, ): """ Ensures that the same request (the 'needle' prompt) yields identical output @@ -54,7 +54,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) + attention_config = {"backend": backend} # Allow overrides from environment (useful for CI tuning) # "facebook/opt-125m" is too small, doesn't reliably test determinism model = resolve_model_name(backend) @@ -92,6 +92,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( max_num_seqs=max_batch_size, gpu_memory_utilization=gpu_mem_util, max_model_len=max_model_len, + attention_config=attention_config, ) # Baseline generation for the needle prompt alone. 
@@ -106,6 +107,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( max_num_seqs=max_batch_size, gpu_memory_utilization=gpu_mem_util, max_model_len=max_model_len, + attention_config=attention_config, ) mismatches = 0 @@ -163,10 +165,8 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( BACKENDS, ) def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( - backend, monkeypatch: pytest.MonkeyPatch + backend, ): - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) - seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) model_name = resolve_model_name(backend) @@ -193,6 +193,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( dtype="bfloat16", # not everything is supported gpu_memory_utilization=0.9, enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, + attention_config={"backend": backend}, ) # Use more realistic prompts for better token generation @@ -381,12 +382,11 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( "backend", BACKENDS, ) -def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): +def test_simple_generation(backend): """ Simple test that runs the model with a basic prompt and prints the output. Useful for quick smoke testing and debugging. """ - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) model = resolve_model_name(backend) llm = LLM( @@ -398,6 +398,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): dtype="bfloat16", enable_prefix_caching=False, enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, + attention_config={"backend": backend}, ) prompt = "the capital of france is" @@ -444,8 +445,6 @@ def test_logprobs_without_batch_invariance_should_fail( The test will PASS if we detect differences (proving batch invariance matters). The test will FAIL if everything matches (suggesting batch invariance isn't needed). 
""" - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) - # CRITICAL: Disable batch invariance for this test monkeypatch.setenv("VLLM_BATCH_INVARIANT", "0") monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", False) @@ -465,6 +464,7 @@ def test_logprobs_without_batch_invariance_should_fail( max_model_len=8192, dtype="bfloat16", enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, + attention_config={"backend": backend}, ) # build ragged prompts to change shapes significantly across BS=1 vs BS=N @@ -649,7 +649,7 @@ def test_logprobs_without_batch_invariance_should_fail( @skip_unsupported @pytest.mark.parametrize("backend", ["FLASH_ATTN"]) def test_decode_logprobs_match_prefill_logprobs( - backend, monkeypatch: pytest.MonkeyPatch + backend, ): """ Test that verifies decode logprobs match prefill logprobs. @@ -664,8 +664,6 @@ def test_decode_logprobs_match_prefill_logprobs( This ensures that the logprobs from decode are consistent with what we would get if we ran prefill on each prefix. 
""" - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) - seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) model_name = resolve_model_name(backend) @@ -689,6 +687,7 @@ def test_decode_logprobs_match_prefill_logprobs( max_model_len=8192, dtype="bfloat16", enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, + attention_config={"backend": backend}, ) # Use a few test prompts @@ -920,6 +919,7 @@ def LLM_with_max_seqs( max_num_seqs: int, gpu_memory_utilization: float, max_model_len: int, + attention_config: dict | None = None, ) -> LLM: """ Helper to construct an LLM with a specific max_num_seqs (batch-size limit) @@ -934,6 +934,7 @@ def LLM_with_max_seqs( tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), enable_prefix_caching=False, enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, + attention_config=attention_config, # Enable for MOE models # enable_expert_parallel=True, ) diff --git a/tests/v1/determinism/test_online_batch_invariance.py b/tests/v1/determinism/test_online_batch_invariance.py index 5e3b997364949..52c8103b2f1ce 100644 --- a/tests/v1/determinism/test_online_batch_invariance.py +++ b/tests/v1/determinism/test_online_batch_invariance.py @@ -136,11 +136,9 @@ def _compare_bs1_vs_bsn_single_process( @skip_unsupported @pytest.mark.parametrize("backend", BACKENDS) def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( - backend: str, monkeypatch: pytest.MonkeyPatch + backend: str, ) -> None: random.seed(int(os.getenv("VLLM_TEST_SEED", "12345"))) - # Override backend for this test (and the RemoteOpenAIServer child process). 
- monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) model_name = resolve_model_name(backend) prompts_all = [_random_prompt(10, 50) for _ in range(32)] @@ -156,6 +154,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( server_args: list[str] = [ "--max-model-len=8192", "--max-num-seqs=32", + f"--attention-backend={backend}", ] if tp_size: server_args += ["-tp", tp_size] diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index 5cef9b33c9984..61e56c079a3b5 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -142,16 +142,17 @@ def run_tests( """Test consistency of combos of async scheduling, preemption, uni/multiproc executor with spec decoding.""" - with monkeypatch.context() as m: - # avoid precision errors - if current_platform.is_rocm(): - if is_testing_with_spec_decoding: - # Use TRITON_ATTN for spec decoding test for consistency - m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN") - else: - m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA") + # Determine attention config based on platform + if current_platform.is_rocm(): + if is_testing_with_spec_decoding: + # Use TRITON_ATTN for spec decoding test for consistency + attention_config = {"backend": "TRITON_ATTN"} else: - m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") + attention_config = {"backend": "ROCM_AITER_FA"} + else: + attention_config = {"backend": "FLEX_ATTENTION"} + + with monkeypatch.context() as m: # lock matmul precision to full FP32 (IEEE) m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee") # m.setenv("VLLM_BATCH_INVARIANT", "1") @@ -174,6 +175,7 @@ def run_tests( spec_config, test_prefill_chunking=test_prefill_chunking, is_testing_with_spec_decoding=is_testing_with_spec_decoding, + attention_config=attention_config, ) outputs.append(test_results) @@ -262,6 +264,7 @@ def run_test( spec_config: dict[str, Any] | None, test_prefill_chunking: bool, is_testing_with_spec_decoding: bool = False, + 
attention_config: dict[str, Any] | None = None, ): spec_decoding = spec_config is not None cache_arg: dict[str, Any] = ( @@ -301,6 +304,7 @@ def run_test( dtype=dtype, speculative_config=spec_config, disable_log_stats=False, + attention_config=attention_config, **cache_arg, ) as vllm_model: results = [] diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py index 0fcb97fe63055..a7be981805c0d 100644 --- a/tests/v1/e2e/test_cascade_attention.py +++ b/tests/v1/e2e/test_cascade_attention.py @@ -10,7 +10,7 @@ from ...utils import create_new_process_for_each_test @create_new_process_for_each_test() @pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "FLASHINFER"]) -def test_cascade_attention(example_system_message, monkeypatch, attn_backend): +def test_cascade_attention(example_system_message, attn_backend): prompt = "\n: Implement fibonacci sequence in Python.\n:" if attn_backend == "FLASHINFER": @@ -19,19 +19,18 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend): "needs investigation. See issue #25679." ) - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) + llm = LLM( + model="Qwen/Qwen2-1.5B-Instruct", attention_config={"backend": attn_backend} + ) + sampling_params = SamplingParams(temperature=0.0, max_tokens=100) - llm = LLM(model="Qwen/Qwen2-1.5B-Instruct") - sampling_params = SamplingParams(temperature=0.0, max_tokens=100) + # No cascade attention. + single_prompt = [example_system_message + prompt] + responses = llm.generate(single_prompt, sampling_params) + ref_output = responses[0].outputs[0].text - # No cascade attention. - single_prompt = [example_system_message + prompt] - responses = llm.generate(single_prompt, sampling_params) - ref_output = responses[0].outputs[0].text - - # (Probably) Use cascade attention. 
- prompts = [example_system_message + prompt] * 64 - responses = llm.generate(prompts, sampling_params) - for response in responses: - assert response.outputs[0].text == ref_output + # (Probably) Use cascade attention. + prompts = [example_system_message + prompt] * 64 + responses = llm.generate(prompts, sampling_params) + for response in responses: + assert response.outputs[0].text == ref_output diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index fcfc8bdce12e9..a25114a4d96cb 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -438,25 +438,26 @@ def test_eagle_correctness( should be the same when using eagle speculative decoding. model_setup: (method, model_name, eagle_model_name, tp_size) """ + # Determine attention config + # Scout requires default backend selection because vision encoder has + # head_dim 88 being incompatible with FLASH_ATTN and needs to fall back + # to Flex Attn + if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN": + if current_platform.is_rocm(): + # TODO: Enable Flex Attn for spec_decode on ROCm + pytest.skip("Flex Attn for spec_decode not supported on ROCm currently") + attention_config = None # Let it fall back to default + else: + attention_config = {"backend": attn_backend} + + if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): + pytest.skip( + "TRITON_ATTN does not support " + "multi-token eagle spec decode on current platform" + ) + with monkeypatch.context() as m: - if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN": - # Scout requires default backend selection - # because vision encoder has head_dim 88 being incompatible - # with FLASH_ATTN and needs to fall back to Flex Attn - - # pass if not ROCm - if current_platform.is_rocm(): - # TODO: Enable Flex Attn for spec_decode on ROCm - pytest.skip("Flex Attn for spec_decode not supported on ROCm currently") - else: - m.setenv("VLLM_MLA_DISABLE", "1") - 
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) - - if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): - pytest.skip( - "TRITON_ATTN does not support " - "multi-token eagle spec decode on current platform" - ) + m.setenv("VLLM_MLA_DISABLE", "1") if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm(): if "deepseek" in model_setup[1].lower(): @@ -471,7 +472,10 @@ def test_eagle_correctness( max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len ref_llm = LLM( - model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size + model=model_name, + max_model_len=max_model_len, + tensor_parallel_size=tp_size, + attention_config=attention_config, ) ref_outputs = ref_llm.chat(test_prompts, sampling_config) del ref_llm @@ -492,6 +496,7 @@ def test_eagle_correctness( max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=enable_chunked_prefill, model_impl=model_impl, + attention_config=attention_config, ) spec_outputs = spec_llm.chat(test_prompts, sampling_config) matches = 0 diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index 453ccc81eb14a..c2c38f51c5003 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -3,21 +3,29 @@ set -xe # Parse command line arguments KV_BUFFER_DEVICE="cuda" # Default to cuda +ATTENTION_BACKEND="" # Default to empty (use vllm default) while [[ $# -gt 0 ]]; do case $1 in --kv_buffer_device) KV_BUFFER_DEVICE="$2" shift 2 ;; + --attention-backend) + ATTENTION_BACKEND="$2" + shift 2 + ;; *) echo "Unknown option $1" - echo "Usage: $0 [--kv_buffer_device ]" + echo "Usage: $0 [--kv_buffer_device ] [--attention-backend ]" exit 1 ;; esac done echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE" +if [[ -n "$ATTENTION_BACKEND" ]]; then + echo "Using attention backend: $ATTENTION_BACKEND" +fi 
DECODER_KV_LAYOUT=${DECODER_KV_LAYOUT:-"HND"} # Default to HND, optional NHD if [[ "$DECODER_KV_LAYOUT" == "NHD" ]]; then @@ -148,6 +156,11 @@ run_tests_for_model() { --tensor-parallel-size $PREFILLER_TP_SIZE \ --kv-transfer-config '$KV_CONFIG'" + # Add attention backend config if specified + if [[ -n "$ATTENTION_BACKEND" ]]; then + BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND" + fi + if [ -n "$model_args" ]; then FULL_CMD="$BASE_CMD $model_args" else @@ -188,7 +201,12 @@ run_tests_for_model() { --block-size ${DECODE_BLOCK_SIZE} \ --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ --kv-transfer-config '$KV_CONFIG'" - + + # Add attention backend config if specified + if [[ -n "$ATTENTION_BACKEND" ]]; then + BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND" + fi + # DP-EP attention mode if [[ -z "$DP_EP" ]]; then BASE_CMD="${BASE_CMD} --tensor-parallel-size $DECODER_TP_SIZE" diff --git a/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh index 9308c81da0635..8199fd516cd43 100755 --- a/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh @@ -15,14 +15,14 @@ configs=( run_tests() { local label=$1 - local extra_env=$2 + local extra_args=$2 echo "=== Running tests (${label}) ===" for cfg in "${configs[@]}"; do - echo "-> Running with ${cfg} ${extra_env:+and ${extra_env}}" + echo "-> Running with ${cfg} ${extra_args:+and ${extra_args}}" # Use 'env' to safely set variables without eval - if ! env ${extra_env} ${cfg} bash "${SCRIPT}"; then - echo "❌ Test failed for config: ${cfg} ${extra_env:+(${extra_env})}" + if ! 
env ${cfg} bash "${SCRIPT}" ${extra_args}; then + echo "❌ Test failed for config: ${cfg} ${extra_args:+(${extra_args})}" exit 1 fi done @@ -34,8 +34,8 @@ run_tests "default backend" "" # Check if FLASHINFER is set (non-empty) if [[ -n "${FLASHINFER:-}" ]]; then - echo "FLASHINFER is set, rerunning with VLLM_ATTENTION_BACKEND=FLASHINFER" - run_tests "FLASHINFER backend" "VLLM_ATTENTION_BACKEND=FLASHINFER" + echo "FLASHINFER is set, rerunning with --attention-backend FLASHINFER" + run_tests "FLASHINFER backend" "--attention-backend FLASHINFER" else echo "FLASHINFER not set, skipping FLASHINFER runs." fi diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 66804fa671c7c..25f4308079595 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -1132,7 +1132,7 @@ def _run_abort_timeout_test(llm: LLM, timeout: int): "TRITON_ATTN", ], ) -def test_register_kv_caches(dist_init, attn_backend, monkeypatch): +def test_register_kv_caches(dist_init, attn_backend): """ Test that register_kv_caches() properly calls nixl_wrapper methods with correct data. 
@@ -1144,9 +1144,7 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch): block layout info """ - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend) - - vllm_config = create_vllm_config() + vllm_config = create_vllm_config(attention_backend=attn_backend) # Import the appropriate backend based on the parameter if attn_backend == "FLASH_ATTN": diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 5cdb1f84b30d4..3a0dbb8e43b52 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -11,6 +11,7 @@ import torch from vllm import SamplingParams from vllm.config import ( + AttentionConfig, CacheConfig, DeviceConfig, KVTransferConfig, @@ -94,6 +95,7 @@ def create_vllm_config( dtype: str = "float16", cache_dtype: str = "auto", hf_overrides: dict[str, Any] | None = None, + attention_backend: str | None = None, ) -> VllmConfig: """Initialize VllmConfig For Testing.""" model_config = ModelConfig( @@ -124,12 +126,14 @@ def create_vllm_config( enable_permute_local_kv=enable_permute_local_kv, kv_connector_extra_config=kv_connector_extra_config or {}, ) + attention_config = AttentionConfig(backend=attention_backend) return VllmConfig( scheduler_config=scheduler_config, model_config=model_config, cache_config=cache_config, kv_transfer_config=kv_transfer_config, device_config=DeviceConfig("cpu"), + attention_config=attention_config, ) diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py index 57474a3dc01e7..1ac5e5b8cdc57 100644 --- a/tests/v1/kv_offload/test_cpu_offloading.py +++ b/tests/v1/kv_offload/test_cpu_offloading.py @@ -13,7 +13,6 @@ from vllm import LLM, SamplingParams, TokensPrompt from vllm.config import KVEventsConfig, KVTransferConfig from vllm.distributed.kv_events import BlockStored, KVEventBatch from vllm.platforms import current_platform -from vllm.utils.system_utils import set_env_var CPU_BLOCK_SIZES = [48] ATTN_BACKENDS 
= ["FLASH_ATTN"] @@ -180,13 +179,13 @@ def test_cpu_offloading(cpu_block_size: int, attn_backend: str) -> None: topic="test", ) - with set_env_var("VLLM_ATTENTION_BACKEND", attn_backend): - llm = LLM( - model="meta-llama/Llama-3.2-1B-Instruct", - gpu_memory_utilization=0.5, - kv_events_config=kv_events_config, - kv_transfer_config=kv_transfer_config, - ) + llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + gpu_memory_utilization=0.5, + kv_events_config=kv_events_config, + kv_transfer_config=kv_transfer_config, + attention_config={"backend": attn_backend}, + ) events_endpoint = events_endpoint.replace("*", "127.0.0.1") subscriber = MockSubscriber(events_endpoint, topic=kv_events_config.topic) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 55e9b4d0660f5..f63cd3a6e42aa 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -15,6 +15,7 @@ from tests.v1.attention.utils import ( ) from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import ( + AttentionConfig, CacheConfig, DeviceConfig, ModelConfig, @@ -38,6 +39,7 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" def _create_proposer( method: str, num_speculative_tokens: int, + attention_backend: str | None = None, speculative_token_tree: list[tuple[int, ...]] | None = None, ) -> EagleProposer: model_config = ModelConfig(model=model_dir, runner="generate", max_model_len=100) @@ -70,6 +72,7 @@ def _create_proposer( max_model_len=model_config.max_model_len, is_encoder_decoder=model_config.is_encoder_decoder, ), + attention_config=AttentionConfig(backend=attention_backend), ) return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type) @@ -331,8 +334,6 @@ def test_load_model( use_distinct_lm_head, monkeypatch, ): - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend) - if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): pytest.skip( "TRITON_ATTN does not 
support " @@ -394,7 +395,9 @@ def test_load_model( assert not isinstance(target_model, SupportsMultiModal) # Create proposer using the helper function - proposer = _create_proposer(method, num_speculative_tokens=8) + proposer = _create_proposer( + method, num_speculative_tokens=8, attention_backend=attn_backend + ) # Call the method under test proposer.load_model(target_model) @@ -420,8 +423,6 @@ def test_load_model( @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform()) @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8]) def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend) - if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): pytest.skip( "TRITON_ATTN does not support " @@ -449,7 +450,9 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): seq_lens = [seq_len_1, seq_len_2] # Create proposer first so we can use its actual hidden_size - proposer = _create_proposer("eagle", num_speculative_tokens) + proposer = _create_proposer( + "eagle", num_speculative_tokens, attention_backend=attn_backend + ) # Get the hidden_size from the proposer to ensure consistency hidden_size = proposer.hidden_size @@ -622,7 +625,9 @@ def test_propose_tree(spec_token_tree): # Create proposer first so we can use its actual hidden_size. proposer = _create_proposer( - "eagle", num_speculative_tokens, speculative_token_tree=spec_token_tree + "eagle", + num_speculative_tokens, + speculative_token_tree=spec_token_tree, ) # Get the hidden_size from the proposer to ensure consistency. 
hidden_size = proposer.hidden_size diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index 15a6bd2659ea9..42991f9f1ae03 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -38,53 +38,48 @@ def test_ngram_max_len(num_speculative_tokens: int): def test_eagle_max_len( monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str ): - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) - - if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): - pytest.skip( - "TRITON_ATTN does not support " - "multi-token eagle spec decode on current platform" - ) - - if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm(): - m.setenv("VLLM_ROCM_USE_AITER", "1") - - llm = LLM( - model="meta-llama/Meta-Llama-3-8B-Instruct", - enforce_eager=True, # For faster initialization. - speculative_config={ - "method": "eagle", - "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", - "num_speculative_tokens": num_speculative_tokens, - "max_model_len": 80, - }, - max_model_len=200, + if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): + pytest.skip( + "TRITON_ATTN does not support " + "multi-token eagle spec decode on current platform" ) - sampling_params = SamplingParams(max_tokens=200, ignore_eos=True) - outputs = llm.generate(_PROMPTS, sampling_params) - for o in outputs: - assert o.outputs[0].finish_reason == "length", ( - "This test is only meaningful if the output " - "is truncated due to max length" - ) - sampling_params = SamplingParams( - max_tokens=200, - structured_outputs=StructuredOutputsParams( - regex="^" + "a b c d e " * 15 + "$" - ), + if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm(): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + + llm = LLM( + model="meta-llama/Meta-Llama-3-8B-Instruct", + enforce_eager=True, # For faster initialization. 
+ speculative_config={ + "method": "eagle", + "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", + "num_speculative_tokens": num_speculative_tokens, + "max_model_len": 80, + }, + max_model_len=200, + attention_config={"backend": attn_backend}, + ) + sampling_params = SamplingParams(max_tokens=200, ignore_eos=True) + outputs = llm.generate(_PROMPTS, sampling_params) + for o in outputs: + assert o.outputs[0].finish_reason == "length", ( + "This test is only meaningful if the output is truncated due to max length" ) - output = llm.generate(_PROMPTS, sampling_params) - for o in output: - assert o.prompt_token_ids is not None - assert ( - len(o.prompt_token_ids) - < 80 - < len(o.prompt_token_ids) + len(o.outputs[0].token_ids) - <= 200 - ), ( - "This test is only meaningful if the output " - "is longer than the eagle max length" - ) - assert o.outputs[0].text == "a b c d e " * 15 + + sampling_params = SamplingParams( + max_tokens=200, + structured_outputs=StructuredOutputsParams(regex="^" + "a b c d e " * 15 + "$"), + ) + output = llm.generate(_PROMPTS, sampling_params) + for o in output: + assert o.prompt_token_ids is not None + assert ( + len(o.prompt_token_ids) + < 80 + < len(o.prompt_token_ids) + len(o.outputs[0].token_ids) + <= 200 + ), ( + "This test is only meaningful if the output " + "is longer than the eagle max length" + ) + assert o.outputs[0].text == "a b c d e " * 15 diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py index e2410a70b1a63..e231c600cba7a 100644 --- a/vllm/v1/attention/backends/rocm_attn.py +++ b/vllm/v1/attention/backends/rocm_attn.py @@ -165,7 +165,7 @@ class RocmAttentionBackend(AttentionBackend): raise ValueError( f"Head size {head_size} is not supported by {attn_type}. " f"Supported head sizes are: {cls.get_supported_head_sizes()}. " - "Set --attention-config.backend=FLEX_ATTENTION to use " + "Set --attention-backend=FLEX_ATTENTION to use " "FlexAttention backend which supports all head sizes." 
) From e3a0f21e6ce78268865cafcdc3dc58c7a80dbc57 Mon Sep 17 00:00:00 2001 From: Xunzhuo Date: Thu, 18 Dec 2025 02:45:56 +0800 Subject: [PATCH 031/176] [docs]: add ecosystem projects sr in docs/governance (#30844) Signed-off-by: bitliu --- docs/governance/committers.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/governance/committers.md b/docs/governance/committers.md index c9428027da953..2f0780a08978b 100644 --- a/docs/governance/committers.md +++ b/docs/governance/committers.md @@ -181,3 +181,4 @@ If you have PRs touching the area, please feel free to ping the area owner for r - Ascend NPU: [@wangxiyuan](https://github.com/wangxiyuan) and [see more details](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html#maintainers) - Intel Gaudi HPU [@xuechendi](https://github.com/xuechendi) and [@kzawora-intel](https://github.com/kzawora-intel) +- Semantic Router: [@xunzhuo](https://github.com/xunzhuo), [@rootfs](https://github.com/rootfs) and [see more details](https://vllm-semantic-router.com/community/team) From e06d0bf0aa2af11220b5c3aa5ccc8f999d0e3161 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 17 Dec 2025 15:20:22 -0500 Subject: [PATCH 032/176] 2.9.1 PyTorch release update (#28495) --- .buildkite/test-amd.yaml | 2 +- .buildkite/test-pipeline.yaml | 2 +- CMakeLists.txt | 4 ++-- pyproject.toml | 2 +- requirements/build.txt | 2 +- requirements/cuda.txt | 6 +++--- requirements/rocm-build.txt | 8 ++++---- requirements/test.in | 6 +++--- requirements/test.txt | 8 ++++---- vllm/model_executor/layers/conv.py | 2 +- 10 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index e8f99100a8de0..6df373632d730 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -740,7 +740,7 @@ steps: # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # we can only upgrade after this is resolved # TODO(jerryzh168): resolve the above 
comment - - uv pip install --system torchao==0.13.0 + - uv pip install --system torchao==0.14.1 - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b4de630b09417..8e3bcfe4a36bc 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -658,7 +658,7 @@ steps: # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # we can only upgrade after this is resolved # TODO(jerryzh168): resolve the above comment - - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129 - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ca71f6ba4df0..a14496e035d9a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,8 +56,8 @@ endif() # requirements.txt files and should be kept consistent. 
The ROCm torch # versions are derived from docker/Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0") -set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1") +set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1") # # Try to find python package with an executable that exactly matches diff --git a/pyproject.toml b/pyproject.toml index a250ab6567f12..c03f96dd7acd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "packaging>=24.2", "setuptools>=77.0.3,<81.0.0", "setuptools-scm>=8.0", - "torch == 2.9.0", + "torch == 2.9.1", "wheel", "jinja2", ] diff --git a/requirements/build.txt b/requirements/build.txt index 23ff8d4fdc1c0..3756371638bad 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -4,7 +4,7 @@ ninja packaging>=24.2 setuptools>=77.0.3,<81.0.0 setuptools-scm>=8 -torch==2.9.0 +torch==2.9.1 wheel jinja2>=3.1.6 regex diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 462f18ef7159b..1417fb99120bc 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding # Dependencies for NVIDIA GPUs ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1. -torch==2.9.0 -torchaudio==2.9.0 +torch==2.9.1 +torchaudio==2.9.1 # These must be updated alongside torch -torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +torchvision==0.24.1 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # FlashInfer should be updated together with the Dockerfile flashinfer-python==0.5.3 diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index b977e80be067f..54af9d995c4a2 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -2,11 +2,11 @@ -r common.txt --extra-index-url https://download.pytorch.org/whl/rocm6.4 -torch==2.9.0 -torchvision==0.24.0 -torchaudio==2.9.0 +torch==2.9.1 +torchvision==0.24.1 +torchaudio==2.9.1 -triton==3.5.0 +triton==3.5.1 cmake>=3.26.1,<4 packaging>=24.2 setuptools>=77.0.3,<80.0.0 diff --git a/requirements/test.in b/requirements/test.in index dfae5b75821f8..55452ce83f232 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -24,9 +24,9 @@ soundfile # required for audio tests jiwer # required for audio tests tblib # for pickling test exceptions timm >=1.0.17 # required for internvl and gemma3n-mm test -torch==2.9.0 -torchaudio==2.9.0 -torchvision==0.24.0 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test mistral_common[image,audio] >= 1.8.5 # required for voxtral test diff --git a/requirements/test.txt b/requirements/test.txt index 571194e05c1ba..ea2093e4347fe 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1123,7 +1123,7 @@ tomli==2.2.1 # via schemathesis tomli-w==1.2.0 # via schemathesis -torch==2.9.0+cu129 +torch==2.9.1+cu129 # via # -r requirements/test.in # accelerate @@ -1152,7 +1152,7 @@ torch==2.9.0+cu129 # torchvision # vector-quantize-pytorch # vocos -torchaudio==2.9.0+cu129 +torchaudio==2.9.1+cu129 # via # -r requirements/test.in # encodec @@ -1165,7 +1165,7 @@ torchmetrics==1.7.4 # pytorch-lightning # terratorch # torchgeo -torchvision==0.24.0+cu129 +torchvision==0.24.1+cu129 # via # -r requirements/test.in # lightly @@ -1206,7 +1206,7 @@ transformers==4.57.3 # 
transformers-stream-generator transformers-stream-generator==0.0.5 # via -r requirements/test.in -triton==3.5.0 +triton==3.5.1 # via torch tritonclient==2.51.0 # via diff --git a/vllm/model_executor/layers/conv.py b/vllm/model_executor/layers/conv.py index 8d51e5bd9920a..1cd02698b3863 100644 --- a/vllm/model_executor/layers/conv.py +++ b/vllm/model_executor/layers/conv.py @@ -251,6 +251,6 @@ class Conv3dLayer(ConvLayerBase): # See: https://github.com/vllm-project/vllm/issues/27406 # and https://github.com/pytorch/pytorch/issues/166122 # By default, we use CUDNN's convolution ops with optimization. - if self.enable_linear and is_torch_equal("2.9.0"): + if self.enable_linear and (is_torch_equal("2.9.0") or is_torch_equal("2.9.1")): return self._forward_mulmat(x) return self._forward_conv(x) From e3fc374a9a69dddb16885d810f1e28d3fdd39ebd Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Wed, 17 Dec 2025 18:00:59 -0500 Subject: [PATCH 033/176] [BugFix] Workspace allocation during profile run : DeepEPHighThroughput + DeepGEMM (#30899) --- vllm/model_executor/layers/fused_moe/modular_kernel.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 484314091cb15..b0834e861338f 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -795,7 +795,10 @@ class FusedMoEModularKernel(torch.nn.Module): top_k, global_num_experts, local_num_experts, - expert_tokens_meta, + # expert_tokens_meta help in allocating optimal/minimal + # amount of workspace. Mark it None, so we allocate for + # the worst-case scenario. 
+ expert_tokens_meta=None, ) ) From 05a83dc6ee84be55fef73d5fa6a77fb56d2dd80f Mon Sep 17 00:00:00 2001 From: Nathan Price <125999937+TheCodeWrangler@users.noreply.github.com> Date: Wed, 17 Dec 2025 18:01:29 -0600 Subject: [PATCH 034/176] feat(api): Eager chat template warmup to eliminate first-request latency (#30700) Signed-off-by: Nathan Price --- vllm/entrypoints/openai/api_server.py | 3 ++ vllm/entrypoints/openai/serving_chat.py | 49 +++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bca9571e39344..d45773f5364e3 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1082,6 +1082,9 @@ async def init_app_state( if "generate" in supported_tasks else None ) + # Warm up chat template processing to avoid first-request latency + if state.openai_serving_chat is not None: + await state.openai_serving_chat.warmup() state.openai_serving_completion = ( OpenAIServingCompletion( engine_client, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 98fc7810faf96..95df373502bfd 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -162,6 +162,55 @@ class OpenAIServingChat(OpenAIServing): self.supports_code_interpreter = False self.python_tool = None + async def warmup(self) -> None: + """ + Warm up the chat template processing to avoid first-request latency. + + This method triggers Jinja2 template compilation and content format + detection that would otherwise happen on the first real request, + causing increased latency on the first request. 
+ """ + logger.info("Warming up chat template processing...") + start_time = time.perf_counter() + + try: + # Get the tokenizer from the engine + tokenizer = await self.engine_client.get_tokenizer() + + # Create a minimal dummy request + dummy_request = ChatCompletionRequest( + messages=[{"role": "user", "content": "warmup"}], + model=None, + max_completion_tokens=1, + ) + + # Call _preprocess_chat to trigger template compilation + # This forces: + # 1. Chat template content format detection + # 2. Jinja2 template compilation + # 3. Tokenizer initialization for chat + await self._preprocess_chat( + dummy_request, + tokenizer, + dummy_request.messages, + chat_template=self.chat_template, + chat_template_content_format=self.chat_template_content_format, + add_generation_prompt=True, + continue_final_message=False, + tool_dicts=None, + documents=None, + chat_template_kwargs=None, + tool_parser=None, + add_special_tokens=False, + ) + + elapsed = (time.perf_counter() - start_time) * 1000 + logger.info("Chat template warmup completed in %.1fms", elapsed) + + except Exception: + # Log but don't fail server startup if warmup fails + logger.exception("Chat template warmup failed") + async def create_chat_completion( self, request: ChatCompletionRequest, From 74a1ac38b00a8cf502db085d1bbd77712cf47e41 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 18 Dec 2025 08:05:24 +0800 Subject: [PATCH 035/176] [v1] Add PrefixLM support to TritonAttention backend (#30386) --- .../generation/test_multimodal_gguf.py | 131 ++++++++++---- .../attention/ops/triton_unified_attention.py | 164 +++++++++++++++--- vllm/model_executor/models/gemma3.py | 69 -------- vllm/v1/attention/backends/triton_attn.py | 39 +++++ 4 files changed, 280 insertions(+), 123 deletions(-) diff --git a/tests/models/multimodal/generation/test_multimodal_gguf.py b/tests/models/multimodal/generation/test_multimodal_gguf.py index e596b20c6302b..813dccf1451b5 100644 --- 
a/tests/models/multimodal/generation/test_multimodal_gguf.py +++ b/tests/models/multimodal/generation/test_multimodal_gguf.py @@ -1,17 +1,23 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Literal, NamedTuple +import os + +os.environ["TOKENIZERS_PARALLELISM"] = "true" + +from typing import Any, NamedTuple import pytest from huggingface_hub import hf_hub_download from pytest import MarkDecorator +from transformers import AutoModelForImageTextToText from tests.quantization.utils import is_quant_method_supported from vllm.assets.image import ImageAsset +from vllm.multimodal.image import rescale_image_size from vllm.utils.torch_utils import set_default_torch_num_threads -from ....conftest import PromptImageInput, VllmRunner +from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner from ...utils import check_logprobs_close @@ -21,9 +27,10 @@ class GGUFMMTestConfig(NamedTuple): gguf_backbone: str gguf_mmproj: str prompt: list[str] - mm_data: dict[Literal["images"], PromptImageInput] + image_names: list[str] # Store names, load PIL images at runtime max_model_len: int = 4096 marks: list[MarkDecorator] = [] + mm_processor_kwargs: dict[str, Any] = {} @property def gguf_model(self): @@ -31,27 +38,75 @@ class GGUFMMTestConfig(NamedTuple): return hf_hub_download(self.gguf_repo, filename=self.gguf_backbone) +# Common prompts aligned with test_common.py "gemma3" entry format +_GEMMA3_PROMPTS = IMAGE_ASSETS.prompts( + { + "stop_sign": ( + "user\n" + "What's the content in the center of the image?" + "\nmodel\n" + ), + "cherry_blossom": ( + "user\n" + "What is the season?" 
+ "\nmodel\n" + ), + } +) + +# Image asset names - load at runtime to avoid pickle issues with subprocess +_GEMMA3_IMAGE_NAMES = ["stop_sign", "cherry_blossom"] + +# Regular multimodal (no pan-and-scan) - uses QAT Q4_0 GGUF GEMMA3_CONFIG = GGUFMMTestConfig( original_model="google/gemma-3-4b-it", gguf_repo="google/gemma-3-4b-it-qat-q4_0-gguf", gguf_backbone="gemma-3-4b-it-q4_0.gguf", gguf_mmproj="mmproj-model-f16-4B.gguf", - prompt=["Describe this image in detail:"], - mm_data={"images": [ImageAsset("stop_sign").pil_image]}, + prompt=_GEMMA3_PROMPTS, + image_names=_GEMMA3_IMAGE_NAMES, + max_model_len=4096, marks=[pytest.mark.core_model], + mm_processor_kwargs={}, ) -MODELS_TO_TEST = [GEMMA3_CONFIG] +# Pan-and-scan multimodal - uses unquantized BF16 GGUF +GEMMA3_CONFIG_PAN_AND_SCAN = GGUFMMTestConfig( + original_model="google/gemma-3-4b-it", + gguf_repo="unsloth/gemma-3-4b-it-GGUF", + gguf_backbone="gemma-3-4b-it-BF16.gguf", + gguf_mmproj="mmproj-BF16.gguf", + prompt=_GEMMA3_PROMPTS, + image_names=_GEMMA3_IMAGE_NAMES, + max_model_len=4096, + marks=[pytest.mark.core_model], + mm_processor_kwargs={"do_pan_and_scan": True}, +) + +MODELS_TO_TEST = [GEMMA3_CONFIG, GEMMA3_CONFIG_PAN_AND_SCAN] def run_multimodal_gguf_test( + hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], model: GGUFMMTestConfig, dtype: str, max_tokens: int, num_logprobs: int, ): - # Run gguf model. + # Load images at runtime (inside subprocess) to avoid pickle issues + images = [ImageAsset(name).pil_image for name in model.image_names] + size_factors = [0.25, 0.5, 1.0] + inputs_per_image = [ + ( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) + for image, prompt in zip(images, model.prompt) + ] + + # NOTE: Run vLLM first to avoid CUDA init issues with multiprocessing fork. + # Run GGUF model via vLLM. 
with ( set_default_torch_num_threads(1), vllm_runner( @@ -60,35 +115,42 @@ def run_multimodal_gguf_test( tokenizer_name=model.original_model, dtype=dtype, max_model_len=model.max_model_len, + mm_processor_kwargs=model.mm_processor_kwargs, ) as gguf_model, ): - gguf_outputs = gguf_model.generate_greedy_logprobs( - prompts=model.prompt, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - **model.mm_data, - ) + gguf_outputs_per_case = [ + gguf_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images, + ) + for prompts, images in inputs_per_image + ] - # Run unquantized model. - with vllm_runner( - model_name=model.original_model, - enforce_eager=True, # faster tests + # Then run HfRunner for HuggingFace baseline comparison. + with hf_runner( + model.original_model, dtype=dtype, - max_model_len=model.max_model_len, - ) as original_model: - original_outputs = original_model.generate_greedy_logprobs( - prompts=model.prompt, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - **model.mm_data, - ) + auto_cls=AutoModelForImageTextToText, + ) as hf_model: + hf_outputs_per_case = [ + hf_model.generate_greedy_logprobs_limit( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images, + ) + for prompts, images in inputs_per_image + ] - check_logprobs_close( - outputs_0_lst=original_outputs, - outputs_1_lst=gguf_outputs, - name_0="original", - name_1="gguf", - ) + for hf_outputs, gguf_outputs in zip(hf_outputs_per_case, gguf_outputs_per_case): + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=gguf_outputs, + name_0="hf", + name_1="gguf", + ) @pytest.mark.skipif( @@ -105,11 +167,14 @@ def run_multimodal_gguf_test( @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [10]) -def test_models( +def test_gemma3_mm_gguf( + hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], model: GGUFMMTestConfig, dtype: str, 
max_tokens: int, num_logprobs: int, ) -> None: - run_multimodal_gguf_test(vllm_runner, model, dtype, max_tokens, num_logprobs) + run_multimodal_gguf_test( + hf_runner, vllm_runner, model, dtype, max_tokens, num_logprobs + ) diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index a1877bb4429b9..ae5a48ec3d26d 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -86,6 +86,9 @@ def kernel_unified_attention_2d( USE_SOFTCAP: tl.constexpr, # bool USE_SINKS: tl.constexpr, # bool SLIDING_WINDOW: tl.constexpr, # int + USE_MM_PREFIX: tl.constexpr, # bool + MAX_MM_RANGES: tl.constexpr, # int + mm_prefix_range_ptr, # [num_seqs] - prefix length for each sequence stride_k_cache_0: tl.int64, # int stride_k_cache_1: tl.int64, # int stride_k_cache_2: tl.int64, # int @@ -270,7 +273,38 @@ def kernel_unified_attention_2d( else: V = V_load - seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 + # Compute attention mask: causal by default (key <= query) + query_abs_pos = context_len + query_pos[:, None] + seq_mask = seq_offset[None, :] <= query_abs_pos + + # Apply sliding window to base mask BEFORE mm_prefix OR. + # Order must match FlexAttention: (causal AND sliding_window) OR mm_prefix + if SLIDING_WINDOW > 0: + seq_mask = seq_mask & ((query_abs_pos - seq_offset) < SLIDING_WINDOW) + + # PrefixLM: extend mask with bidirectional ranges for multimodal tokens. + # Applied AFTER sliding window so mm_prefix ranges override SW restriction. 
+ if USE_MM_PREFIX: + for i in range(MAX_MM_RANGES): + range_start = tl.load( + mm_prefix_range_ptr + seq_idx * MAX_MM_RANGES * 2 + i * 2 + ) + range_end = tl.load( + mm_prefix_range_ptr + seq_idx * MAX_MM_RANGES * 2 + i * 2 + 1 + ) + + is_valid = range_start < range_end + q_in_range = ( + (query_abs_pos >= range_start) + & (query_abs_pos <= range_end) + & is_valid + ) + k_in_range = ( + (seq_offset[None, :] >= range_start) + & (seq_offset[None, :] <= range_end) + & is_valid + ) + seq_mask |= q_in_range & k_in_range # S : (BLOCK_M, TILE_SIZE) S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32) @@ -284,13 +318,6 @@ def kernel_unified_attention_2d( query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, S, float("-inf") ) - if SLIDING_WINDOW > 0: - S = tl.where( - (context_len + query_pos[:, None] - seq_offset) < SLIDING_WINDOW, - S, - float("-inf"), - ) - if USE_ALIBI_SLOPES: S += alibi_slope[:, None] * (seq_offset - context_len) @@ -398,6 +425,9 @@ def kernel_unified_attention_3d( num_seqs: tl.int32, BLOCK_M: tl.constexpr, # int NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int + USE_MM_PREFIX: tl.constexpr, # bool + MAX_MM_RANGES: tl.constexpr, # int + mm_prefix_range_ptr, # [num_seqs] - prefix length for each sequence ): q_block_global_idx = tl.program_id(0) kv_head_idx = tl.program_id(1) @@ -559,7 +589,38 @@ def kernel_unified_attention_3d( else: V = V_load - seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 + # Compute attention mask: causal by default (key <= query) + query_abs_pos = context_len + query_pos[:, None] + seq_mask = seq_offset[None, :] <= query_abs_pos + + # Apply sliding window to base mask BEFORE mm_prefix OR. + # Order must match FlexAttention: (causal AND sliding_window) OR mm_prefix + if SLIDING_WINDOW > 0: + seq_mask = seq_mask & ((query_abs_pos - seq_offset) < SLIDING_WINDOW) + + # PrefixLM: extend mask with bidirectional ranges for multimodal tokens. 
+ # Applied AFTER sliding window so mm_prefix ranges override SW restriction. + if USE_MM_PREFIX: + for i in range(MAX_MM_RANGES): + range_start = tl.load( + mm_prefix_range_ptr + seq_idx * MAX_MM_RANGES * 2 + i * 2 + ) + range_end = tl.load( + mm_prefix_range_ptr + seq_idx * MAX_MM_RANGES * 2 + i * 2 + 1 + ) + + is_valid = range_start < range_end + q_in_range = ( + (query_abs_pos >= range_start) + & (query_abs_pos <= range_end) + & is_valid + ) + k_in_range = ( + (seq_offset[None, :] >= range_start) + & (seq_offset[None, :] <= range_end) + & is_valid + ) + seq_mask |= q_in_range & k_in_range # S : (BLOCK_M, TILE_SIZE) S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32) @@ -572,13 +633,6 @@ def kernel_unified_attention_3d( query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, S, float("-inf") ) - if SLIDING_WINDOW > 0: - S = tl.where( - (context_len + query_pos[:, None] - seq_offset) < SLIDING_WINDOW, - S, - float("-inf"), - ) - if USE_ALIBI_SLOPES: S += alibi_slope[:, None] * (seq_offset - context_len) @@ -732,6 +786,43 @@ def reduce_segments( tl.store(output_ptr + output_offset, acc, mask=dim_mask) +def _is_gemma3_attention(head_size: int, sliding_window: int) -> bool: + """Detect Gemma3 models via unique (head_size, sliding_window) signature. + + Gemma3 models are the only ones using sliding_window=1024 with + head_size 128 (27B) or 256 (1B, 4B, 12B). Other SWA models use + different window sizes (Mistral=4096, Phi-3=2047). + """ + return sliding_window == 1024 and head_size in (128, 256) + + +def _get_tile_size( + head_size: int, + sliding_window: int, + element_size: int, + is_mm_prefix: bool, + is_prefill: bool, +) -> int: + """Select tile size with Gemma3-specific optimization. + + For Gemma3, use 32 for both prefill and decode to better utilize + the larger head dimension (128/256). For other models, use + the default vLLM behavior. 
+ """ + if is_mm_prefix: + # Multimodal bidirectional attention needs a larger tile size + return 64 + + if _is_gemma3_attention(head_size, sliding_window): + # Gemma3: use 32 for decode (default is 16) + return 32 + + # Default behavior + if is_prefill: + return 32 + return 16 if element_size >= 2 else 32 + + def unified_attention( q, k, @@ -759,6 +850,8 @@ def unified_attention( qq_bias=None, # Optional tensor for sinks sinks=None, + # Optional tensor for prefix lengths (PrefixLM support) + mm_prefix_range=None, ): assert causal, "Only causal attention is supported" assert q_descale is None, "Q scales not supported" @@ -766,6 +859,17 @@ def unified_attention( if sinks is not None: assert sinks.shape[0] == q.shape[1], "Sinks must be num_query_heads size" + use_mm_prefix = False + max_mm_ranges = 0 + if mm_prefix_range is not None: + if mm_prefix_range.ndim == 3: + use_mm_prefix = True + max_mm_ranges = mm_prefix_range.shape[1] + else: + raise ValueError( + f"Unsupported mm_prefix_range shape: {mm_prefix_range.shape}" + ) + use_alibi_slopes = alibi_slopes is not None use_qq_bias = qq_bias is not None @@ -792,11 +896,23 @@ def unified_attention( # = floor(q.shape[0] / BLOCK_Q) + num_seqs total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs - # Assigning default tile sizes for prefill and decode. - # Note: each tile size must be at least 32 for "fp8" (q.element_size() == 1) - # and at least 16 for all other data types. - TILE_SIZE_PREFILL = 32 - TILE_SIZE_DECODE = 16 if q.element_size() >= 2 else 32 + # Tile sizes for prefill and decode. Gemma3 models use optimized values. + # Note: tile size must be at least 32 for fp8 (element_size == 1). 
+ sliding_window_val = 1 + window_size[0] if window_size[0] >= 0 else 0 + TILE_SIZE_PREFILL = _get_tile_size( + head_size, + sliding_window_val, + q.element_size(), + is_mm_prefix=use_mm_prefix, + is_prefill=True, + ) + TILE_SIZE_DECODE = _get_tile_size( + head_size, + sliding_window_val, + q.element_size(), + is_mm_prefix=use_mm_prefix, + is_prefill=False, + ) # Launch the 2D kernel if # 1. No intermediate tiled softmax buffers for the 3D kernel have been allocated, or @@ -847,6 +963,9 @@ def unified_attention( USE_QQ_BIAS=use_qq_bias, USE_SOFTCAP=(softcap > 0), USE_SINKS=(sinks is not None), + USE_MM_PREFIX=use_mm_prefix, + MAX_MM_RANGES=max_mm_ranges, + mm_prefix_range_ptr=mm_prefix_range, SLIDING_WINDOW=(1 + window_size[0]), stride_k_cache_0=k.stride(0), stride_k_cache_1=k.stride(1), @@ -895,6 +1014,9 @@ def unified_attention( USE_QQ_BIAS=use_qq_bias, USE_SOFTCAP=(softcap > 0), USE_SINKS=(sinks is not None), + USE_MM_PREFIX=use_mm_prefix, + MAX_MM_RANGES=max_mm_ranges, + mm_prefix_range_ptr=mm_prefix_range, SLIDING_WINDOW=(1 + window_size[0]), stride_k_cache_0=k.stride(0), stride_k_cache_1=k.stride(1), diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 70f72b5cb9beb..e6a201c669e96 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -19,7 +19,6 @@ from collections.abc import Iterable from itertools import islice import torch -import torch.nn.functional as F from torch import nn from transformers import Gemma3TextConfig @@ -226,77 +225,9 @@ class Gemma3Attention(nn.Module): q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) - - if not kwargs.get("has_images", False): - # Fast path for text-only inputs. The performance for the text-only - # inputs are not affected by the naive attention below. 
- output, _ = self.o_proj(attn_output) - return output - - # NOTE(woosuk): Gemma3 uses bidirectional attention between image tokens - # that correspond to the same image while using causal attention - # otherwise. Current attention backends cannot handle this pattern, so - # we temporarily use a naive attention implementation with mask tensors. - - # We intentionally keep the attention backend as-is and only override - # `attn_output` with the naive implementation's output. This minimizes - # changes to existing model runners and attention backends. The call to - # `self.attn(q, k, v)` is only used to populate the KV cache - its - # output is discarded and overwritten below. While this duplicates - # computation, it maintains compatibility. - # TODO(woosuk): Optimize by implementing custom attention kernels. - attn_output = self.naive_attn_with_masks(q, k, v, out=attn_output, **kwargs) output, _ = self.o_proj(attn_output) return output - def naive_attn_with_masks( - self, - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - out: torch.Tensor, - **kwargs, - ) -> torch.Tensor: - # NOTE(woosuk): As described in the comment above, this code is not - # meant to be performant. It is only meant to be correct. - q = q.view(-1, self.num_heads, self.head_dim) - # Expand the key and value to handle GQA. - num_queries_per_kv = self.num_heads // self.num_kv_heads - k = k.view(-1, self.num_kv_heads, self.head_dim) - k = k.repeat_interleave(num_queries_per_kv, dim=-2) - v = v.view(-1, self.num_kv_heads, self.head_dim) - v = v.repeat_interleave(num_queries_per_kv, dim=-2) - - if self.is_sliding: - attn_masks = kwargs["local_attn_masks"] - else: - attn_masks = kwargs["global_attn_masks"] - - seq_lens = kwargs["seq_lens"] - start_idx = 0 - for seq_len, attn_mask in zip(seq_lens, attn_masks): - end_idx = start_idx + seq_len - query = q[start_idx:end_idx].unsqueeze(0) - key = k[start_idx:end_idx].unsqueeze(0) - value = v[start_idx:end_idx].unsqueeze(0) - - # Transpose. 
- query = query.transpose(1, 2) - key = key.transpose(1, 2) - value = value.transpose(1, 2) - - output = F.scaled_dot_product_attention( - query, - key, - value, - attn_mask, - self.scaling, - ) - output = output.transpose(1, 2).flatten(-2, -1) - out[start_idx:end_idx] = output - start_idx = end_idx - return out - class Gemma3DecoderLayer(nn.Module): def __init__( diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 7bea3862a03f9..ca7be990ca555 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -76,6 +76,39 @@ class TritonAttentionMetadata: # Optional aot scheduling scheduler_metadata: torch.Tensor | None = None prefix_scheduler_metadata: torch.Tensor | None = None + mm_prefix_range: dict[int, list[tuple[int, int]]] | None = None + + @property + def mm_prefix_range_tensor(self) -> torch.Tensor | None: + """Convert mm_prefix_range dict to padded tensor for Triton kernel. + + Returns shape: (num_seqs, max_ranges, 2) with 0-padding for empty ranges. + Empty ranges have start==end==0, which kernel skips via is_valid check. + """ + # TODO(Isotr0py): Move to model runner's attention metadata + # preparation to avoid duplicate computation. 
+ if self.mm_prefix_range is None: + return None + + num_seqs = self.seq_lens.shape[0] + device = self.seq_lens.device + + # Collect ranges, using [(0,0)] for empty sequences to ensure uniform dims + range_lists = [ + self.mm_prefix_range.get(i, [(0, 0)]) or [(0, 0)] for i in range(num_seqs) + ] + + # Return None if all ranges are trivial (only (0,0) placeholders) + if all(r == [(0, 0)] for r in range_lists): + return None + + # Create 2D tensors with shape (num_ranges, 2) for each sequence + range_tensors = [ + torch.tensor(r, dtype=torch.int32, device=device).view(-1, 2) + for r in range_lists + ] + + return torch.nested.nested_tensor(range_tensors).to_padded_tensor(0) class TritonAttentionMetadataBuilder(AttentionMetadataBuilder[TritonAttentionMetadata]): @@ -268,6 +301,10 @@ class TritonAttentionBackend(AttentionBackend): def supports_head_size(cls, head_size: int) -> bool: return head_size >= 32 + @classmethod + def supports_mm_prefix(cls) -> bool: + return True + @classmethod def supports_sink(cls) -> bool: return True @@ -427,6 +464,7 @@ class TritonAttentionImpl(AttentionImpl): softmax_segm_expsum = attn_metadata.softmax_segm_expsum descale_shape = (cu_seqlens_q.shape[0] - 1, key_cache.shape[2]) + mm_prefix_range_tensor = attn_metadata.mm_prefix_range_tensor unified_attention( q=query[:num_actual_tokens], @@ -453,6 +491,7 @@ class TritonAttentionImpl(AttentionImpl): softmax_segm_expsum=softmax_segm_expsum, sinks=self.sinks, output_scale=output_scale, + mm_prefix_range=mm_prefix_range_tensor, ) return output From ed2897f336b579cd6c1f5f6e48d2d5931804d315 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 17 Dec 2025 19:46:44 -0500 Subject: [PATCH 036/176] [CI][Feature] Adds auto-rebase PR rule (#30875) Signed-off-by: Rafael Vasquez Co-authored-by: Kevin H. 
Luu --- .github/mergify.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index 3e4e21efe39df..61a03135be395 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -349,6 +349,18 @@ pull_request_rules: add: - tool-calling +- name: auto-rebase if approved, ready, and 40 commits behind main + conditions: + - base = main + - label=ready + - "#approved-reviews-by >= 1" + - "#commits-behind >= 40" + - -closed + - -draft + - -conflict + actions: + rebase: {} + - name: ping author on conflicts and add 'needs-rebase' label conditions: - label != stale From a0b782f9ccd02add3516f074de163c9847686066 Mon Sep 17 00:00:00 2001 From: SungMinCho Date: Wed, 17 Dec 2025 17:40:51 -0800 Subject: [PATCH 037/176] [Metrics] Model FLOPs Utilization estimation (#30738) Signed-off-by: SungMinCho Signed-off-by: Mark McLoughlin Co-authored-by: Mark McLoughlin --- tests/v1/metrics/test_perf_metrics.py | 897 ++++++++++++++++++ vllm/config/observability.py | 3 + vllm/engine/arg_utils.py | 6 + vllm/envs.py | 5 + vllm/v1/core/sched/scheduler.py | 13 +- vllm/v1/metrics/loggers.py | 17 +- vllm/v1/metrics/perf.py | 1244 +++++++++++++++++++++++++ vllm/v1/metrics/stats.py | 3 + 8 files changed, 2186 insertions(+), 2 deletions(-) create mode 100644 tests/v1/metrics/test_perf_metrics.py create mode 100644 vllm/v1/metrics/perf.py diff --git a/tests/v1/metrics/test_perf_metrics.py b/tests/v1/metrics/test_perf_metrics.py new file mode 100644 index 0000000000000..b6cda7bef3d41 --- /dev/null +++ b/tests/v1/metrics/test_perf_metrics.py @@ -0,0 +1,897 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Tests for the analytic estimators in metrics/flops.py. 
+""" + +import types +from types import SimpleNamespace + +from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config +from transformers.models.llama4.configuration_llama4 import ( + Llama4Config, + Llama4TextConfig, +) +from transformers.models.qwen3.configuration_qwen3 import Qwen3Config +from transformers.models.qwen3_moe.configuration_qwen3_moe import Qwen3MoeConfig + +from vllm.config.model import ModelConfig, get_hf_text_config +from vllm.v1.metrics.perf import ( + AttentionMetrics, + BaseConfigParser, + ExecutionContext, + FfnMetrics, + ModelMetrics, + ParsedArgs, + UnembedMetrics, +) + + +class MockModelConfig: + """Mock ModelConfig that implements the getter methods used by parsers.""" + + def __init__(self, hf_config, dtype): + self.hf_config = hf_config + self.hf_text_config = get_hf_text_config(hf_config) + self.dtype = dtype + self.is_attention_free = False + + def __getattr__(self, name): + # 1. Check if ModelConfig actually has this attribute + if not hasattr(ModelConfig, name): + raise AttributeError( + f"'{type(self).__name__}' object has no attribute '{name}' " + f"and neither does 'ModelConfig'." + ) + + # 2. Fetch the attribute from the ModelConfig CLASS + attr = getattr(ModelConfig, name) + + # 3. Case A: It is a @property + if isinstance(attr, property): + # Manually invoke the property's getter, passing 'self' (this mock instance) + return attr.__get__(self, self.__class__) + + # 4. Case B: It is a standard method (function) + if isinstance(attr, types.FunctionType): + # Bind the function to 'self' so it acts like a method of + # this instance. This creates a bound method where 'self' is + # automatically passed as the first arg. + return types.MethodType(attr, self) + + # 5. 
Case C: It is a class attribute / static variable + return attr + + +def create_mock_vllm_config( + hf_config, + model_dtype="bfloat16", + cache_dtype="auto", + quant_config=None, + data_parallel_size=1, + tensor_parallel_size=1, + pipeline_parallel_size=1, + enable_expert_parallel=False, +) -> SimpleNamespace: + vllm_config = SimpleNamespace() + vllm_config.model_config = MockModelConfig(hf_config, model_dtype) + + vllm_config.cache_config = SimpleNamespace() + vllm_config.cache_config.cache_dtype = cache_dtype + + vllm_config.quant_config = quant_config + + vllm_config.parallel_config = SimpleNamespace() + vllm_config.parallel_config.data_parallel_size = data_parallel_size + vllm_config.parallel_config.tensor_parallel_size = tensor_parallel_size + vllm_config.parallel_config.pipeline_parallel_size = pipeline_parallel_size + vllm_config.parallel_config.enable_expert_parallel = enable_expert_parallel + + return vllm_config + + +#### Parser Tests #### + + +def test_base_config_parser(): + """Test BaseConfigParser extracts base model attributes correctly.""" + hf_config = Qwen3Config( + vocab_size=50000, + hidden_size=2048, + num_attention_heads=16, + num_hidden_layers=24, + ) + vllm_config = create_mock_vllm_config(hf_config, model_dtype="float16") + + parser = BaseConfigParser() + args = ParsedArgs() + result = parser.parse(args, vllm_config) + + assert result.vocab_size == 50000 + assert result.hidden_size == 2048 + assert result.num_attention_heads == 16 + assert result.num_hidden_layers == 24 + assert result.weight_byte_size == 2 # float16 is 2 bytes + assert result.activation_byte_size == 2 # default activation size + + +def test_base_attention_config_parser_with_gqa(): + """Test BaseAttentionConfigParser with grouped query attention.""" + hf_config = Qwen3Config( + hidden_size=4096, + num_attention_heads=32, + num_key_value_heads=8, # GQA with 4:1 ratio + head_dim=128, + ) + vllm_config = create_mock_vllm_config(hf_config) + + parser_chain = 
AttentionMetrics.get_parser() + result = parser_chain.parse(vllm_config) + + assert result.num_key_value_heads == 8 + assert result.head_dim == 128 + + +def test_base_attention_config_parser_without_gqa(): + """ + Test BaseAttentionConfigParser defaults to MHA when num_key_value_heads not + specified. + """ + hf_config = Qwen3Config( + hidden_size=4096, + num_attention_heads=32, + # No num_key_value_heads specified + ) + vllm_config = create_mock_vllm_config(hf_config) + + parser_chain = AttentionMetrics.get_parser() + result = parser_chain.parse(vllm_config) + + # Should default to MHA (num_key_value_heads = num_attention_heads) + assert result.num_key_value_heads == 32 + + +def test_base_ffn_config_parser_dense(): + """Test BaseFfnConfigParser for dense FFN.""" + hf_config = Qwen3Config( + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + ) + vllm_config = create_mock_vllm_config(hf_config) + + parser_chain = FfnMetrics.get_parser() + result = parser_chain.parse(vllm_config) + + assert result.intermediate_size == 11008 + assert result.num_experts == 0 + assert result.num_experts_per_tok == 0 + assert result.num_moe_layers == 0 # No MoE + + +def test_base_ffn_config_parser_moe(): + """Test BaseFfnConfigParser for MoE FFN.""" + hf_config = Qwen3MoeConfig( + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_experts=64, + num_experts_per_tok=8, + moe_intermediate_size=14336, + n_shared_experts=2, + ) + vllm_config = create_mock_vllm_config(hf_config) + + parser_chain = FfnMetrics.get_parser() + result = parser_chain.parse(vllm_config) + + assert result.num_experts == 64 + assert result.num_experts_per_tok == 8 + assert result.moe_intermediate_size == 14336 + assert result.num_shared_experts == 2 + assert result.num_moe_layers == 32 # All layers are MoE by default + + +def test_interleave_moe_layer_step_parser(): + """Test InterleaveMoeLayerStepParser correctly computes MoE layer count.""" + hf_config = Llama4Config( + 
text_config=Llama4TextConfig( + num_hidden_layers=32, + num_local_experts=64, + interleave_moe_layer_step=4, # Every 4th layer is MoE + ), + ) + + vllm_config = create_mock_vllm_config(hf_config) + + parser_chain = FfnMetrics.get_parser() + result = parser_chain.parse(vllm_config) + + assert result.num_moe_layers == 8 + + +def test_moe_layer_freq_parser(): + """Test MoeLayerFreqParser correctly computes MoE layer count.""" + hf_config = DeepseekV3Config( + num_hidden_layers=30, + n_routed_experts=64, + moe_layer_freq=3, # Every 3rd layer after first_k_dense_replace + first_k_dense_replace=6, # First 6 layers are dense + ) + vllm_config = create_mock_vllm_config(hf_config) + + parser_chain = FfnMetrics.get_parser() + result = parser_chain.parse(vllm_config) + + # Layers >= 6 and divisible by 3: 6, 9, 12, 15, 18, 21, 24, 27 + expected_moe_layers = len( + [layer for layer in range(30) if layer >= 6 and layer % 3 == 0] + ) + assert expected_moe_layers == 8 + assert result.num_moe_layers == expected_moe_layers + + +#### ComponentMetrics Tests #### + + +def test_attention_metrics_scaling(): + """Test that attention metrics scale proportionally with model dimensions.""" + base_hf_config = Qwen3Config( + hidden_size=2048, + num_attention_heads=16, + num_key_value_heads=16, + num_hidden_layers=12, + head_dim=128, + ) + + base_vllm_config = create_mock_vllm_config(base_hf_config) + base_metrics = AttentionMetrics.from_vllm_config(base_vllm_config) + + # Test scaling with number of layers + double_layers_hf_config = Qwen3Config( + hidden_size=2048, + num_attention_heads=16, + num_key_value_heads=16, + num_hidden_layers=24, # Double the layers + head_dim=128, + ) + double_layers_vllm_config = create_mock_vllm_config(double_layers_hf_config) + double_layers_metrics = AttentionMetrics.from_vllm_config(double_layers_vllm_config) + + ctx = ExecutionContext.from_single_request( + num_tokens=100, context_len=512, is_prefill=True + ) + + # FLOPS should double when layers double + 
base_flops = base_metrics.get_num_flops(ctx) + double_flops = double_layers_metrics.get_num_flops(ctx) + assert double_flops == 2 * base_flops + + # Read/write bytes should also scale proportionally + base_read = base_metrics.get_read_bytes(ctx) + double_read = double_layers_metrics.get_read_bytes(ctx) + assert double_read == 2 * base_read + + base_write = base_metrics.get_write_bytes(ctx) + double_write = double_layers_metrics.get_write_bytes(ctx) + assert double_write == 2 * base_write + + +def test_attention_metrics_grouped_query(): + """Test attention metrics handle grouped query attention correctly.""" + mha_hf_config = Qwen3Config( + hidden_size=4096, + num_attention_heads=32, + num_key_value_heads=32, # MHA + num_hidden_layers=1, + ) + mha_config = create_mock_vllm_config(mha_hf_config) + + gqa_hf_config = Qwen3Config( + hidden_size=4096, + num_attention_heads=32, + num_key_value_heads=8, # GQA with 4:1 ratio + num_hidden_layers=1, + ) + gqa_config = create_mock_vllm_config(gqa_hf_config) + + mha_metrics = AttentionMetrics.from_vllm_config(mha_config) + gqa_metrics = AttentionMetrics.from_vllm_config(gqa_config) + + ctx = ExecutionContext.from_single_request( + num_tokens=1, context_len=1024, is_prefill=False + ) + + # GQA should have less KV cache reads since fewer KV heads + mha_read = mha_metrics.get_read_bytes(ctx) + gqa_read = gqa_metrics.get_read_bytes(ctx) + assert gqa_read < mha_read + + +def test_ffn_metrics_scaling(): + """Test FFN metrics scale proportionally with model dimensions.""" + base_hf_config = Qwen3Config( + hidden_size=2048, + intermediate_size=8192, + num_hidden_layers=12, + ) + base_vllm_config = create_mock_vllm_config(base_hf_config) + base_metrics = FfnMetrics.from_vllm_config(base_vllm_config) + + # Test scaling with intermediate size + larger_ffn_hf_config = Qwen3Config( + hidden_size=2048, + intermediate_size=16384, # Double intermediate size + num_hidden_layers=12, + ) + larger_ffn_vllm_config = 
create_mock_vllm_config(larger_ffn_hf_config) + larger_ffn_metrics = FfnMetrics.from_vllm_config(larger_ffn_vllm_config) + + ctx = ExecutionContext.from_single_request( + num_tokens=100, context_len=512, is_prefill=True + ) + + # FLOPS should double when intermediate size doubles + base_flops = base_metrics.get_num_flops(ctx) + larger_flops = larger_ffn_metrics.get_num_flops(ctx) + assert larger_flops == base_flops * 2 + + +def test_moe_metrics_vs_dense(): + """Test MoE metrics versus dense metrics.""" + dense_hf_config = Qwen3Config( + hidden_size=2048, + intermediate_size=8192, + num_hidden_layers=12, + ) + dense_config = create_mock_vllm_config(dense_hf_config) + + moe_hf_config = Qwen3MoeConfig( + hidden_size=2048, + intermediate_size=8192, + num_hidden_layers=12, + num_experts=64, + num_experts_per_tok=2, # 2 routed expert + moe_intermediate_size=8192, + n_shared_experts=0, + ) + moe_config = create_mock_vllm_config(moe_hf_config) + + dense_metrics = FfnMetrics.from_vllm_config(dense_config) + moe_metrics = FfnMetrics.from_vllm_config(moe_config) + + ctx = ExecutionContext.from_single_request( + num_tokens=100, context_len=512, is_prefill=True + ) + + # MoE should have different compute/memory characteristics + dense_flops = dense_metrics.get_num_flops(ctx) + moe_flops = moe_metrics.get_num_flops(ctx) + + # 2 routed experts vs 1 dense. 
+ assert moe_flops == dense_flops * 2 + + +def test_unembed_metrics_scaling(): + """Test unembedding metrics scale with vocab size.""" + small_vocab_hf_config = Qwen3Config( + hidden_size=2048, + vocab_size=32000, + ) + small_vocab_config = create_mock_vllm_config(small_vocab_hf_config) + + large_vocab_hf_config = Qwen3Config( + hidden_size=2048, + vocab_size=64000, # Double vocab size + ) + large_vocab_config = create_mock_vllm_config(large_vocab_hf_config) + + small_vocab_metrics = UnembedMetrics.from_vllm_config(small_vocab_config) + large_vocab_metrics = UnembedMetrics.from_vllm_config(large_vocab_config) + + ctx = ExecutionContext.from_single_request( + num_tokens=100, context_len=512, is_prefill=True + ) + + # FLOPS should double when vocab size doubles + small_flops = small_vocab_metrics.get_num_flops(ctx) + large_flops = large_vocab_metrics.get_num_flops(ctx) + assert large_flops == 2 * small_flops + + +def test_prefill_vs_decode_differences(): + """Test that prefill and decode have different memory access patterns.""" + hf_config = Qwen3Config( + hidden_size=2048, + num_attention_heads=16, + num_key_value_heads=16, + num_hidden_layers=1, + ) + config = create_mock_vllm_config(hf_config) + + metrics = AttentionMetrics.from_vllm_config(config) + + prefill_ctx = ExecutionContext.from_single_request( + num_tokens=512, context_len=512, is_prefill=True + ) + decode_ctx = ExecutionContext.from_single_request( + num_tokens=1, context_len=512, is_prefill=False + ) + + prefill_read = metrics.get_read_bytes(prefill_ctx) + decode_read = metrics.get_read_bytes(decode_ctx) + + assert prefill_read != decode_read + + +def test_model_metrics_aggregation(): + """Test ModelMetrics correctly aggregates across components.""" + hf_config = Qwen3Config( + hidden_size=2048, + num_attention_heads=16, + num_hidden_layers=12, + vocab_size=32000, + intermediate_size=8192, + ) + config = create_mock_vllm_config(hf_config) + + model_metrics = ModelMetrics(config) + ctx = 
ExecutionContext.from_single_request( + num_tokens=100, context_len=512, is_prefill=True + ) + + # Should have metrics for attention, ffn, and unembed + total_flops = model_metrics.get_num_flops(ctx) + breakdown = model_metrics.get_num_flops_breakdown(ctx) + + # Breakdown should sum to total + assert total_flops == sum(breakdown.values()) + + +def test_moe_expert_activation_proportional_scaling(): + """Test that routed expert metrics scale proportionally with num_experts_per_tok.""" + base_moe_config = Qwen3MoeConfig( + hidden_size=2048, + intermediate_size=8192, + num_hidden_layers=12, + num_experts=64, + num_experts_per_tok=1, # 1 expert per token + moe_intermediate_size=8192, + n_shared_experts=2, + ) + + double_experts_config = Qwen3MoeConfig( + hidden_size=2048, + intermediate_size=8192, + num_hidden_layers=12, + num_experts=64, + num_experts_per_tok=2, # 2 experts per token (double) + moe_intermediate_size=8192, + n_shared_experts=2, # Same shared experts + ) + + triple_experts_config = Qwen3MoeConfig( + hidden_size=2048, + intermediate_size=8192, + num_hidden_layers=12, + num_experts=64, + num_experts_per_tok=3, # 3 experts per token (triple) + moe_intermediate_size=8192, + n_shared_experts=2, # Same shared experts + ) + + base_vllm_config = create_mock_vllm_config(base_moe_config) + double_vllm_config = create_mock_vllm_config(double_experts_config) + triple_vllm_config = create_mock_vllm_config(triple_experts_config) + + base_metrics = FfnMetrics.from_vllm_config(base_vllm_config) + double_metrics = FfnMetrics.from_vllm_config(double_vllm_config) + triple_metrics = FfnMetrics.from_vllm_config(triple_vllm_config) + + ctx = ExecutionContext.from_single_request( + num_tokens=100, context_len=512, is_prefill=True + ) + + # Get total metrics - the key insight is that differences should be proportional + base_flops = base_metrics.get_num_flops(ctx) + double_flops = double_metrics.get_num_flops(ctx) + triple_flops = triple_metrics.get_num_flops(ctx) + + # The 
difference between double and base should equal one additional expert + one_expert_diff = double_flops - base_flops + + # The difference between triple and base should equal two additional experts + two_expert_diff = triple_flops - base_flops + + # Proportional scaling: 2 * (1 expert diff) should equal (2 expert diff) + assert two_expert_diff == 2 * one_expert_diff + + # Same logic applies to memory operations + base_read = base_metrics.get_read_bytes(ctx) + double_read = double_metrics.get_read_bytes(ctx) + triple_read = triple_metrics.get_read_bytes(ctx) + + one_expert_read_diff = double_read - base_read + two_expert_read_diff = triple_read - base_read + + assert two_expert_read_diff == 2 * one_expert_read_diff + + # Same for write bytes + base_write = base_metrics.get_write_bytes(ctx) + double_write = double_metrics.get_write_bytes(ctx) + triple_write = triple_metrics.get_write_bytes(ctx) + + one_expert_write_diff = double_write - base_write + two_expert_write_diff = triple_write - base_write + + assert two_expert_write_diff == 2 * one_expert_write_diff + + +def test_quantization_config_parser_fp8(): + """Test quantization parsers with fp8.""" + + class MockQuantConfig: + def get_name(self): + return "fp8" + + hf_config = Qwen3Config( + hidden_size=2048, num_attention_heads=16, num_hidden_layers=1 + ) + vllm_config = create_mock_vllm_config(hf_config, quant_config=MockQuantConfig()) + + attn_result = AttentionMetrics.get_parser().parse(vllm_config) + assert attn_result.weight_byte_size == 1 # fp8 + + ffn_result = FfnMetrics.get_parser().parse(vllm_config) + assert ffn_result.weight_byte_size == 1 # fp8 + + +def test_quantization_config_parser_mxfp4(): + """Test quantization parsers with mxfp4.""" + + class MockQuantConfig: + def get_name(self): + return "mxfp4" + + hf_config = Qwen3Config( + hidden_size=2048, intermediate_size=8192, num_hidden_layers=1 + ) + vllm_config = create_mock_vllm_config(hf_config, quant_config=MockQuantConfig()) + + ffn_result = 
FfnMetrics.get_parser().parse(vllm_config) + assert ffn_result.weight_byte_size == 0.5 # mxfp4 + + +#### Per-GPU Tests #### + + +def test_attention_per_gpu_with_tensor_parallelism(): + """Test attention metrics with tensor parallelism - per_gpu vs global.""" + hf_config = Qwen3Config( + hidden_size=4096, + num_attention_heads=32, + num_key_value_heads=8, + num_hidden_layers=24, + ) + + # Test with TP=4 + vllm_config = create_mock_vllm_config(hf_config, tensor_parallel_size=4) + metrics = AttentionMetrics.from_vllm_config(vllm_config) + + ctx = ExecutionContext.from_single_request( + num_tokens=128, context_len=1024, is_prefill=True + ) + + # Get global and per-gpu metrics + global_flops = metrics.get_num_flops(ctx, per_gpu=False) + per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True) + + # With TP=4, global flops should be 4x per-gpu flops (heads divided by 4) + assert global_flops == 4 * per_gpu_flops + + # Same for read/write bytes + global_read = metrics.get_read_bytes(ctx, per_gpu=False) + per_gpu_read = metrics.get_read_bytes(ctx, per_gpu=True) + # Reads should scale similarly (weight reads are divided by TP) + assert global_read > per_gpu_read + + global_write = metrics.get_write_bytes(ctx, per_gpu=False) + per_gpu_write = metrics.get_write_bytes(ctx, per_gpu=True) + assert global_write > per_gpu_write + + +def test_attention_per_gpu_with_pipeline_parallelism(): + """Test attention metrics with pipeline parallelism - per_gpu vs global.""" + hf_config = Qwen3Config( + hidden_size=2048, + num_attention_heads=16, + num_hidden_layers=32, + ) + + # Test with PP=4 + vllm_config = create_mock_vllm_config(hf_config, pipeline_parallel_size=4) + metrics = AttentionMetrics.from_vllm_config(vllm_config) + + ctx = ExecutionContext.from_single_request( + num_tokens=100, context_len=512, is_prefill=False + ) + + # Get global and per-gpu metrics + global_flops = metrics.get_num_flops(ctx, per_gpu=False) + per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True) + + # 
With PP=4, global flops should be 4x per-gpu flops (layers divided by 4) + assert global_flops == 4 * per_gpu_flops + + global_read = metrics.get_read_bytes(ctx, per_gpu=False) + per_gpu_read = metrics.get_read_bytes(ctx, per_gpu=True) + assert global_read == 4 * per_gpu_read + + +def test_ffn_per_gpu_with_tensor_parallelism(): + """Test FFN metrics with tensor parallelism - per_gpu vs global.""" + hf_config = Qwen3Config( + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + ) + + # Test with DP=2, TP=4 (ffn_tp_size will be 8) + vllm_config = create_mock_vllm_config( + hf_config, + data_parallel_size=2, + tensor_parallel_size=4, + ) + metrics = FfnMetrics.from_vllm_config(vllm_config) + + # ffn_tp_size should be dp_size * tp_size = 8 (when EP not enabled) + assert metrics.ffn_tp_size == 8 + + ctx = ExecutionContext.from_single_request( + num_tokens=128, context_len=2048, is_prefill=True + ) + + # Get global and per-gpu metrics + global_flops = metrics.get_num_flops(ctx, per_gpu=False) + per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True) + + # With ffn_tp_size=8, global should be 8x per-gpu + assert global_flops == 8 * per_gpu_flops + + +def test_ffn_per_gpu_with_pipeline_parallelism(): + """Test FFN metrics with pipeline parallelism - per_gpu vs global.""" + hf_config = Qwen3Config( + hidden_size=2048, + intermediate_size=8192, + num_hidden_layers=24, + ) + + # Test with PP=6 + vllm_config = create_mock_vllm_config(hf_config, pipeline_parallel_size=6) + metrics = FfnMetrics.from_vllm_config(vllm_config) + + ctx = ExecutionContext.from_single_request( + num_tokens=100, context_len=512, is_prefill=True + ) + + # Get global and per-gpu metrics + global_flops = metrics.get_num_flops(ctx, per_gpu=False) + per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True) + + # With PP=6, global should be 6x per-gpu (layers divided by 6) + assert global_flops == 6 * per_gpu_flops + + +def test_moe_per_gpu_with_expert_parallelism(): + """ + Test MoE 
metrics with expert parallelism - verifies num_activated_experts bug fix. + """ + hf_config = Qwen3MoeConfig( + hidden_size=2048, + intermediate_size=8192, + num_hidden_layers=24, + num_experts=64, + num_experts_per_tok=8, + moe_intermediate_size=14336, + n_shared_experts=2, + ) + + # Test with DP=2, TP=4, EP enabled (ffn_ep_size will be 8) + vllm_config = create_mock_vllm_config( + hf_config, + data_parallel_size=2, + tensor_parallel_size=4, + enable_expert_parallel=True, + ) + metrics = FfnMetrics.from_vllm_config(vllm_config) + + # When EP enabled, ffn_ep_size = dp_size * tp_size = 8 + assert metrics.ffn_ep_size == 8 + assert metrics.ffn_tp_size == 1 + + ctx = ExecutionContext.from_single_request( + num_tokens=100, context_len=512, is_prefill=True + ) + + # Get per-gpu metrics + per_gpu_read_breakdown = metrics.get_read_bytes_breakdown(ctx, per_gpu=True) + global_read_breakdown = metrics.get_read_bytes_breakdown(ctx, per_gpu=False) + + # Verify that routed expert weight reads are reasonable + # With per_gpu=True, each GPU has 64/8 = 8 experts + # T=100, E_per_gpu=8/8=1, so T*E=100 expert activations + # num_activated_experts should be min(100, 8) = 8 + + # Check that weight reads scale appropriately + # Global has all 64 experts, per-gpu has 8 experts + # So weight reads should reflect this difference + if "routed_up_gate_weights" in per_gpu_read_breakdown: + per_gpu_weight_reads = per_gpu_read_breakdown["routed_up_gate_weights"] + global_weight_reads = global_read_breakdown["routed_up_gate_weights"] + + # The ratio should reflect the expert count difference + # This verifies the bug fix works correctly + assert per_gpu_weight_reads < global_weight_reads + + # Global should read more experts than per-gpu + # Exact ratio depends on num_activated_experts calculation + ratio = global_weight_reads / per_gpu_weight_reads + # Should be > 1 since global has more experts to read + assert ratio > 1 + + +def test_moe_per_gpu_expert_activation_accounting(): + """ + Test 
that MoE correctly accounts for expert activations with small batch sizes. + """ + hf_config = Qwen3MoeConfig( + hidden_size=2048, + intermediate_size=8192, + num_hidden_layers=12, + num_experts=64, + num_experts_per_tok=8, + moe_intermediate_size=14336, + n_shared_experts=0, # No shared experts for this test + ) + + # Test with EP=8 + vllm_config = create_mock_vllm_config( + hf_config, + data_parallel_size=8, + enable_expert_parallel=True, + ) + metrics = FfnMetrics.from_vllm_config(vllm_config) + + # Small batch: T=10, E_per_gpu=8/8=1 + # Each GPU: T*E = 10*1 = 10 activations + # Experts per GPU: 64/8 = 8 + # So num_activated_experts should be min(10, 8) = 8 + small_ctx = ExecutionContext.from_single_request( + num_tokens=10, context_len=512, is_prefill=True + ) + small_read = metrics.get_read_bytes_breakdown(small_ctx, per_gpu=True) + + # Large batch: T=1000, E_per_gpu=1 + # Each GPU: T*E = 1000*1 = 1000 activations + # Experts per GPU: 8 + # So num_activated_experts should be min(1000, 8) = 8 (all experts activated) + large_ctx = ExecutionContext.from_single_request( + num_tokens=1000, context_len=512, is_prefill=True + ) + large_read = metrics.get_read_bytes_breakdown(large_ctx, per_gpu=True) + + # Weight reads should be similar (both activate all 8 experts per GPU) + # But activation reads should differ (proportional to T*E) + if "routed_up_gate_weights" in small_read: + small_weight = small_read["routed_up_gate_weights"] + large_weight = large_read["routed_up_gate_weights"] + + # Weight reads should be the same (both read all 8 experts) + assert small_weight == large_weight + + # But input activation reads should scale with T*E + small_input = small_read["routed_up_gate_input"] + large_input = large_read["routed_up_gate_input"] + assert large_input == 100 * small_input # 1000/10 = 100x + + +def test_unembed_per_gpu_with_tensor_parallelism(): + """Test unembed metrics with tensor parallelism - per_gpu vs global.""" + hf_config = Qwen3Config( + 
hidden_size=4096, + vocab_size=128000, + ) + + # Test with TP=8 + vllm_config = create_mock_vllm_config(hf_config, tensor_parallel_size=8) + metrics = UnembedMetrics.from_vllm_config(vllm_config) + + ctx = ExecutionContext.from_single_request( + num_tokens=100, context_len=512, is_prefill=True + ) + + # Get global and per-gpu metrics + global_flops = metrics.get_num_flops(ctx, per_gpu=False) + per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True) + + # With TP=8, vocab is divided by 8, so global should be 8x per-gpu + assert global_flops == 8 * per_gpu_flops + + # For read bytes, weight reads scale with TP but input reads don't (replicated) + global_read_breakdown = metrics.get_read_bytes_breakdown(ctx, per_gpu=False) + per_gpu_read_breakdown = metrics.get_read_bytes_breakdown(ctx, per_gpu=True) + + # Input reads should be the same (replicated across TP ranks) + assert global_read_breakdown["input"] == per_gpu_read_breakdown["input"] + + # Weight reads should scale 8x (divided by TP) + assert global_read_breakdown["weight"] == 8 * per_gpu_read_breakdown["weight"] + + +def test_model_metrics_per_gpu_aggregation(): + """Test ModelMetrics correctly aggregates per_gpu metrics across components.""" + hf_config = Qwen3Config( + hidden_size=2048, + num_attention_heads=16, + num_hidden_layers=12, + vocab_size=32000, + intermediate_size=8192, + ) + + # Test with mixed parallelism: TP=2, PP=2 + vllm_config = create_mock_vllm_config( + hf_config, + tensor_parallel_size=2, + pipeline_parallel_size=2, + ) + + model_metrics = ModelMetrics(vllm_config) + ctx = ExecutionContext.from_single_request( + num_tokens=100, context_len=512, is_prefill=True + ) + + # Get breakdowns for both modes + per_gpu_breakdown = model_metrics.get_num_flops_breakdown(ctx, per_gpu=True) + global_breakdown = model_metrics.get_num_flops_breakdown(ctx, per_gpu=False) + + # Verify breakdown sums match totals + per_gpu_total = model_metrics.get_num_flops(ctx, per_gpu=True) + global_total = 
model_metrics.get_num_flops(ctx, per_gpu=False) + + assert per_gpu_total == sum(per_gpu_breakdown.values()) + assert global_total == sum(global_breakdown.values()) + + # Global should be larger than per-gpu due to parallelism + assert global_total > per_gpu_total + + # With TP=2 and PP=2, the ratio depends on which parallelism applies to + # which component but we can verify that global is reasonably larger + ratio = global_total / per_gpu_total + assert ratio > 1 # Should be between PP and TP*PP depending on component mix + + +def test_attention_per_gpu_heads_not_evenly_divisible(): + """Test attention with heads not evenly divisible by TP.""" + hf_config = Qwen3Config( + hidden_size=2048, + num_attention_heads=17, # Not divisible by 4 + num_key_value_heads=5, # Not divisible by 4 + num_hidden_layers=8, + ) + + vllm_config = create_mock_vllm_config(hf_config, tensor_parallel_size=4) + metrics = AttentionMetrics.from_vllm_config(vllm_config) + + ctx = ExecutionContext.from_single_request( + num_tokens=64, context_len=256, is_prefill=True + ) + + # Should not crash and should handle max(1, ...) correctly + per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True) + global_flops = metrics.get_num_flops(ctx, per_gpu=False) + + # Both should be positive + assert per_gpu_flops > 0 + assert global_flops > 0 + assert global_flops > per_gpu_flops diff --git a/vllm/config/observability.py b/vllm/config/observability.py index e40bf18a00ce2..4aca6b15684ac 100644 --- a/vllm/config/observability.py +++ b/vllm/config/observability.py @@ -64,6 +64,9 @@ class ObservabilityConfig: module in the model and attach informations such as input/output shapes to nvtx range markers. 
Noted that this doesn't work with CUDA graphs enabled.""" + enable_mfu_metrics: bool = False + """Enable Model FLOPs Utilization (MFU) metrics.""" + @cached_property def collect_model_forward_time(self) -> bool: """Whether to collect model forward time for the request.""" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 03720bd2516d4..64510bdcaf8a8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -523,6 +523,7 @@ class EngineArgs: enable_layerwise_nvtx_tracing: bool = ( ObservabilityConfig.enable_layerwise_nvtx_tracing ) + enable_mfu_metrics: bool = ObservabilityConfig.enable_mfu_metrics scheduling_policy: SchedulerPolicy = SchedulerConfig.policy scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls @@ -1042,6 +1043,10 @@ class EngineArgs: "--enable-layerwise-nvtx-tracing", **observability_kwargs["enable_layerwise_nvtx_tracing"], ) + observability_group.add_argument( + "--enable-mfu-metrics", + **observability_kwargs["enable_mfu_metrics"], + ) # Scheduler arguments scheduler_kwargs = get_kwargs(SchedulerConfig) @@ -1689,6 +1694,7 @@ class EngineArgs: kv_cache_metrics_sample=self.kv_cache_metrics_sample, cudagraph_metrics=self.cudagraph_metrics, enable_layerwise_nvtx_tracing=self.enable_layerwise_nvtx_tracing, + enable_mfu_metrics=self.enable_mfu_metrics, ) # Compilation config overrides diff --git a/vllm/envs.py b/vllm/envs.py index 2f8158d88d6c5..b59991aa6523a 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -244,6 +244,7 @@ if TYPE_CHECKING: VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256 VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary" VLLM_USE_V2_MODEL_RUNNER: bool = False + VLLM_DEBUG_MFU_METRICS: bool = False def get_default_cache_root(): @@ -1565,6 +1566,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_V2_MODEL_RUNNER": lambda: bool( int(os.getenv("VLLM_USE_V2_MODEL_RUNNER", "0")) ), + # Debug logging for --enable-mfu-metrics + 
"VLLM_DEBUG_MFU_METRICS": lambda: bool( + int(os.getenv("VLLM_DEBUG_MFU_METRICS", "0")) + ), } # --8<-- [end:env-vars-definition] diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 8e835ad096405..da8339558b143 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -43,6 +43,7 @@ from vllm.v1.core.sched.request_queue import SchedulingPolicy, create_request_qu from vllm.v1.core.sched.utils import check_stop, remove_all from vllm.v1.engine import EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs from vllm.v1.kv_cache_interface import KVCacheConfig +from vllm.v1.metrics.perf import ModelMetrics, PerfStats from vllm.v1.metrics.stats import ( PrefixCacheStats, SchedulerStats, @@ -219,6 +220,10 @@ class Scheduler(SchedulerInterface): self.use_pp = self.parallel_config.pipeline_parallel_size > 1 self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER + self.perf_metrics: ModelMetrics | None = None + if self.log_stats and vllm_config.observability_config.enable_mfu_metrics: + self.perf_metrics = ModelMetrics(vllm_config) + def schedule(self) -> SchedulerOutput: # NOTE(woosuk) on the scheduling algorithm: # There's no "decoding phase" nor "prefill phase" in the scheduler. 
@@ -1066,6 +1071,10 @@ class Scheduler(SchedulerInterface): kv_connector_output = model_runner_output.kv_connector_output cudagraph_stats = model_runner_output.cudagraph_stats + perf_stats: PerfStats | None = None + if self.perf_metrics and self.perf_metrics.is_enabled(): + perf_stats = self.perf_metrics.get_step_perf_stats_per_gpu(scheduler_output) + outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list) spec_decoding_stats: SpecDecodingStats | None = None kv_connector_stats: KVConnectorStats | None = ( @@ -1262,7 +1271,7 @@ class Scheduler(SchedulerInterface): if ( stats := self.make_stats( - spec_decoding_stats, kv_connector_stats, cudagraph_stats + spec_decoding_stats, kv_connector_stats, cudagraph_stats, perf_stats ) ) is not None: # Return stats to only one of the front-ends. @@ -1485,6 +1494,7 @@ class Scheduler(SchedulerInterface): spec_decoding_stats: SpecDecodingStats | None = None, kv_connector_stats: KVConnectorStats | None = None, cudagraph_stats: CUDAGraphStat | None = None, + perf_stats: PerfStats | None = None, ) -> SchedulerStats | None: if not self.log_stats: return None @@ -1510,6 +1520,7 @@ class Scheduler(SchedulerInterface): spec_decoding_stats=spec_stats, kv_connector_stats=connector_stats_payload, cudagraph_stats=cudagraph_stats, + perf_stats=perf_stats, ) def make_spec_decoding_stats( diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 9eaee1bb97bb9..2213b952c7a89 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -19,6 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( from vllm.logger import init_logger from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group from vllm.v1.engine import FinishReason +from vllm.v1.metrics.perf import PerfMetricsLogging from vllm.v1.metrics.prometheus import unregister_vllm_metrics from vllm.v1.metrics.stats import ( CachingMetrics, @@ -118,6 +119,9 @@ class LoggingStatLogger(StatLoggerBase): self.engine_is_idle 
= False self.aggregated = False + if self._enable_perf_stats(): + self.perf_metrics_logging = PerfMetricsLogging(vllm_config) + def _reset(self, now): self.last_log_time = now @@ -127,6 +131,9 @@ class LoggingStatLogger(StatLoggerBase): self.num_corrupted_reqs: int = 0 self.num_preemptions: int = 0 + def _enable_perf_stats(self) -> bool: + return self.vllm_config.observability_config.enable_mfu_metrics + def _track_iteration_stats(self, iteration_stats: IterationStats): # Save tracked stats for token counters. self.num_prompt_tokens += iteration_stats.num_prompt_tokens @@ -175,6 +182,8 @@ class LoggingStatLogger(StatLoggerBase): self.cudagraph_logging.observe(scheduler_stats.cudagraph_stats) if not self.aggregated: self.last_scheduler_stats = scheduler_stats + if (perf_stats := scheduler_stats.perf_stats) and self._enable_perf_stats(): + self.perf_metrics_logging.observe(perf_stats) if mm_cache_stats: self.mm_caching_metrics.observe(mm_cache_stats) @@ -211,7 +220,7 @@ class LoggingStatLogger(StatLoggerBase): "Running: %d reqs", "Waiting: %d reqs", ] - log_args = [ + log_args: list[int | float | str] = [ self.last_prompt_throughput, self.last_generation_throughput, self.last_scheduler_stats.num_running_reqs, @@ -254,6 +263,8 @@ class LoggingStatLogger(StatLoggerBase): self.kv_connector_logging.log(log_fn=log_fn) if self.cudagraph_logging is not None: self.cudagraph_logging.log(log_fn=log_fn) + if self._enable_perf_stats(): + self.perf_metrics_logging.log(log_fn=log_fn, log_prefix=self.log_prefix) def log_engine_initialized(self): if self.vllm_config.cache_config.num_gpu_blocks: @@ -282,6 +293,10 @@ class AggregatedLoggingStatLogger(LoggingStatLogger, AggregateStatLoggerBase): def log_prefix(self): return "{} Engines Aggregated: ".format(len(self.engine_indexes)) + def _enable_perf_stats(self) -> bool: + # Adding per_gpu perf stats across engines can lead to misleading numbers. 
+ return False + def record( self, scheduler_stats: SchedulerStats | None, diff --git a/vllm/v1/metrics/perf.py b/vllm/v1/metrics/perf.py new file mode 100644 index 0000000000000..446a81fc4855d --- /dev/null +++ b/vllm/v1/metrics/perf.py @@ -0,0 +1,1244 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Analytic flops/memory estimation module for transformer components, +to help derive MFU (Model Flops Utilization) stats for a running model. +""" + +import json +import time +from abc import ABC, abstractmethod +from collections.abc import Iterable +from dataclasses import asdict, dataclass +from typing import Any, Protocol + +import torch +from pydantic import BaseModel, Field, ValidationError, model_validator +from typing_extensions import Self + +import vllm.envs as envs +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.utils.torch_utils import ( + STR_DTYPE_TO_TORCH_DTYPE, + get_dtype_size, + get_kv_cache_torch_dtype, +) +from vllm.v1.core.sched.output import SchedulerOutput + +logger = init_logger(__name__) + + +class InvalidComponent(Exception): + """ + Custom exception to indicate that a certain ComponentMetric is not + applicable to the given VllmConfig. 
+ """ + + pass + + +#### Basic Data Types #### + + +@dataclass +class DebugPerfStats: + ## Stats for debugging the metrics calculation + calc_duration: float = 0.0 # time spent calculating these stats + num_prefill_requests: int = 0 + num_decode_requests: int = 0 + context_breakdown: dict[str, int] | None = None + num_flops_per_gpu_breakdown: dict[str, int] | None = None + num_read_bytes_per_gpu_breakdown: dict[str, int] | None = None + num_write_bytes_per_gpu_breakdown: dict[str, int] | None = None + + +@dataclass +class PerfStats: + num_flops_per_gpu: int = 0 + num_read_bytes_per_gpu: int = 0 + num_write_bytes_per_gpu: int = 0 + debug_stats: DebugPerfStats | None = None + + +@dataclass +class ExecutionContext: + """ + Represents an execution context for a batch of requests. + + This class aggregates statistics across multiple requests in a batch, + separately tracking prefill and decode phases. + + Example) + - Batch with one full prefill (2048 tokens) and one decode (1 token, 8192 context): + ctx = ExecutionContext() + ctx.add(2048, 2048, is_prefill=True) + ctx.add(1, 8192, is_prefill=False) + """ + + # Prefill phase statistics + num_prefill_requests: int = 0 + prefill_num_tokens: int = 0 # sum of num_tokens for prefill requests + prefill_context_len: int = 0 # sum of context_len for prefill requests + prefill_token_context_product: int = 0 # sum of (num_tokens * context_len) + + # Decode phase statistics + num_decode_requests: int = 0 + decode_num_tokens: int = 0 # sum of num_tokens for decode requests + decode_context_len: int = 0 # sum of context_len for decode requests + decode_token_context_product: int = 0 # sum of (num_tokens * context_len) + + def add(self, num_tokens: int, context_len: int, is_prefill: bool) -> None: + """Add a single request's statistics to this batch context.""" + if is_prefill: + self.num_prefill_requests += 1 + self.prefill_num_tokens += num_tokens + self.prefill_context_len += context_len + self.prefill_token_context_product += 
num_tokens * context_len + else: + self.num_decode_requests += 1 + self.decode_num_tokens += num_tokens + self.decode_context_len += context_len + self.decode_token_context_product += num_tokens * context_len + + def total_num_tokens(self) -> int: + """Total number of tokens across all requests in the batch.""" + return self.prefill_num_tokens + self.decode_num_tokens + + def total_token_context_product(self) -> int: + """Total sum of (num_tokens * context_len) across all requests.""" + return self.prefill_token_context_product + self.decode_token_context_product + + @classmethod + def from_single_request( + cls, num_tokens: int, context_len: int, is_prefill: bool + ) -> "ExecutionContext": + """Create an ExecutionContext from a single request. + + This is a convenience method primarily for testing. + """ + ctx = cls() + ctx.add(num_tokens, context_len, is_prefill) + return ctx + + +class ParsedArgs: + """ + Syntactic sugar so that Parsers can use dot notations + to access/update the parsed arguments. + + e.g.) + args = ParsedArgs() + args.x = 3 + args.y = args.x + 1 + """ + + def __getattr__(self, name: str) -> Any: + raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'") + + def __setattr__(self, name: str, value: Any) -> None: + object.__setattr__(self, name, value) + + def model_dump(self) -> dict[str, Any]: + return vars(self).copy() + + +#### Abstract #### + + +class Parser(Protocol): + def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs: + """ + Parse the vllm config and update the current ParsedArgs and pass it on. + If the parser isn't applicable to the vllm_config, it will do nothing. + """ + ... + + +class ParserChain: + """ + Applies chain of parser in a sequential order. + Later parsers might overwrite results from previous parsers, + so parsers should be chained in the appropriate order if they + are not mutually exclusive. 
+ """ + + def __init__(self, *parsers: Parser) -> None: + self.parsers = list(parsers) + + def add_parser(self, parser: Parser) -> None: + self.parsers.append(parser) + + def parse(self, vllm_config: VllmConfig) -> ParsedArgs: + args = ParsedArgs() + for parser in self.parsers: + args = parser.parse(args, vllm_config) + return args + + +_COMPONENT_METRICS_REGISTRY: dict[str, type["ComponentMetrics"]] = {} + + +class ComponentMetrics(BaseModel, ABC): + """ + Each concrete ComponentMetrics class is associated with: + - fields that are required for metric derivation + (fields are specified/validated through pydantic model) + - parser to parse VllmConfig into fields + - metric methods that derive flops/bytes for a given execution context + """ + + @classmethod + @abstractmethod + def component_type(cls) -> str: ... + + @classmethod + @abstractmethod + def get_parser(cls) -> ParserChain: + """ + Return a ParserChain that provides values for all required fields. + The returned parser chain must populate ParsedArgs with values for every + field defined on this ComponentMetrics class. Missing fields will cause + a ValidationError when from_vllm_config() is called. + See individual Parser docstrings for which args they provide, and field + comments on ComponentMetrics subclasses for which parser provides each field. + """ + ... + + def __init_subclass__(cls): + _COMPONENT_METRICS_REGISTRY[cls.component_type()] = cls + + @classmethod + def from_vllm_config(cls, vllm_config: VllmConfig) -> Self: + """ + Instantiate this class from VllmConfig. + Raises ValidationError if parsing fails. 
+ """ + + parser = cls.get_parser() + parsed_args = parser.parse(vllm_config) + try: + return cls.model_validate(parsed_args.model_dump()) + except ValidationError as e: + raise InvalidComponent(f"Invalid {cls.component_type()} config: {e}") from e + + @classmethod + def registered_metrics(cls) -> Iterable[type["ComponentMetrics"]]: + return iter(_COMPONENT_METRICS_REGISTRY.values()) + + @abstractmethod + def get_num_flops_breakdown( + self, ctx: ExecutionContext, per_gpu: bool = True + ) -> dict[str, int]: ... + + @abstractmethod + def get_read_bytes_breakdown( + self, ctx: ExecutionContext, per_gpu: bool = True + ) -> dict[str, int]: ... + + @abstractmethod + def get_write_bytes_breakdown( + self, ctx: ExecutionContext, per_gpu: bool = True + ) -> dict[str, int]: ... + + def get_num_flops(self, ctx: ExecutionContext, per_gpu: bool = True) -> int: + return sum(self.get_num_flops_breakdown(ctx, per_gpu).values()) + + def get_read_bytes(self, ctx: ExecutionContext, per_gpu: bool = True) -> int: + return sum(self.get_read_bytes_breakdown(ctx, per_gpu).values()) + + def get_write_bytes(self, ctx: ExecutionContext, per_gpu: bool = True) -> int: + return sum(self.get_write_bytes_breakdown(ctx, per_gpu).values()) + + +#### parsers #### + + +class BaseConfigParser(Parser): + """ + Parses base model configuration. 
+ Provides: vocab_size, hidden_size, num_attention_heads, num_hidden_layers, + weight_byte_size, activation_byte_size, dp_size, tp_size, pp_size, enable_ep + """ + + def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs: + model_config = vllm_config.model_config + + args.vocab_size = model_config.get_vocab_size() + args.hidden_size = model_config.get_hidden_size() + # NOTE: model_config.get_attention_heads() divide by TP + # so we access field manually here to get total num_heads + args.num_attention_heads = get_required( + model_config.hf_text_config, "num_attention_heads" + ) + args.num_hidden_layers = get_required( + model_config.hf_text_config, "num_hidden_layers" + ) + + model_dtype = vllm_config.model_config.dtype + + if isinstance(model_dtype, torch.dtype): + torch_dtype = model_dtype + elif isinstance(model_dtype, str) and model_dtype in STR_DTYPE_TO_TORCH_DTYPE: + torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype] + else: + # FIXME: handle this better + logger.warning( + "Unknown model_dtype %s, defaulting to bfloat16", + model_dtype, + ) + torch_dtype = torch.bfloat16 + + args.weight_byte_size = get_dtype_size(torch_dtype) + + # FIXME: handle this better by parsing whether activations use + # bf16, fp32, etc... + args.activation_byte_size = 2 + + args.dp_size = vllm_config.parallel_config.data_parallel_size + args.tp_size = vllm_config.parallel_config.tensor_parallel_size + args.pp_size = vllm_config.parallel_config.pipeline_parallel_size + args.enable_ep = vllm_config.parallel_config.enable_expert_parallel + + return args + + +#### Attention #### + + +class BaseAttentionConfigParser(Parser): + """ + Parses attention-specific configuration. 
+ Provides: num_key_value_heads, head_dim, cache_byte_size + """ + + def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs: + model_config = vllm_config.model_config + + args.num_key_value_heads = model_config.get_total_num_kv_heads() + args.head_dim = model_config.get_head_size() + + model_dtype = vllm_config.model_config.dtype + cache_dtype = vllm_config.cache_config.cache_dtype + + kv_cache_torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) + args.cache_byte_size = get_dtype_size(kv_cache_torch_dtype) + + return args + + +class AttentionQuantizationConfigParser(Parser): + """ + Parses quantization configuration for attention layers. + Overrides: weight_byte_size + """ + + def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs: + cfg = vllm_config.quant_config + + if cfg is None: + return args + + quant_method = cfg.get_name() + if quant_method in ["fp8", "fbgemm_fp8"]: + # FIXME: This is a hacky coarse-grained fp8 quantization detection. + # FIXME: These configs also have concept of "ignored layers" and we + # need to solve the same problem as above. + args.weight_byte_size = 1 + elif quant_method == "mxfp4": + # FIXME: Also has "ignored layers" issue above + args.weight_byte_size = 0.5 + else: + # FIXME: Add more parsing logic for different quant methods. 
+ raise InvalidComponent + + return args + + +class AttentionMetrics(ComponentMetrics): + # From BaseConfigParser + num_hidden_layers: int = Field(..., gt=0) + hidden_size: int = Field(..., gt=0) + num_attention_heads: int = Field(..., gt=0) + activation_byte_size: int = Field(..., gt=0) + tp_size: int = Field(..., gt=0) + pp_size: int = Field(..., gt=0) + + # From BaseAttentionConfigParser + num_key_value_heads: int = Field(..., gt=0) + head_dim: int = Field(..., gt=0) + cache_byte_size: int = Field(..., gt=0) + + # From BaseConfig Parser, overridden by AttentionQuantizationConfigParser + weight_byte_size: int | float = Field(..., gt=0) + + # TODO: discern cases where we have mixture of different attention layer types + # such as SWA, MLA, etc. + + @classmethod + def component_type(cls) -> str: + return "attn" + + @classmethod + def get_parser(cls) -> ParserChain: + return ParserChain( + BaseConfigParser(), + BaseAttentionConfigParser(), + AttentionQuantizationConfigParser(), + ) + + def get_num_flops_breakdown( + self, ctx: ExecutionContext, per_gpu: bool = True + ) -> dict[str, int]: + L, D, q, kv, d = ( + self.num_hidden_layers, + self.hidden_size, + self.num_attention_heads, + self.num_key_value_heads, + self.head_dim, + ) + T = ctx.total_num_tokens() + TC = ctx.total_token_context_product() + + if per_gpu: + L //= self.pp_size + # tensor parallel along heads + q = max(1, q // self.tp_size) + kv = max(1, kv // self.tp_size) + + return { + "qkv_proj": 2 * T * D * (q + 2 * kv) * d * L, + "attn_qk": 2 * q * TC * d * L, + "attn_av": 2 * q * TC * d * L, + "out_proj": 2 * T * D * q * d * L, + } + + def get_read_bytes_breakdown( + self, ctx: ExecutionContext, per_gpu: bool = True + ) -> dict[str, int]: + L, D, q, kv, d = ( + self.num_hidden_layers, + self.hidden_size, + self.num_attention_heads, + self.num_key_value_heads, + self.head_dim, + ) + T = ctx.total_num_tokens() + + if per_gpu: + L //= self.pp_size + # tensor parallel along heads + q = max(1, q // 
self.tp_size) + kv = max(1, kv // self.tp_size) + + read_bytes = {} + + read_bytes["qkv_input"] = T * D * self.activation_byte_size * L + read_bytes["qkv_weight"] = int(D * (q + 2 * kv) * d * self.weight_byte_size * L) + + # Attention input reads differ between prefill and decode + # Prefill: read Q, K, V activations (all in activation_byte_size) + if ctx.prefill_num_tokens > 0: + read_bytes["attn_input"] = ( + (ctx.prefill_num_tokens * q + 2 * ctx.prefill_context_len * kv) + * d + * self.activation_byte_size + * L + ) + + # Decode: read Q activations + read K, V from cache (in cache_byte_size) + if ctx.decode_num_tokens > 0: + read_bytes["attn_input"] = read_bytes.get("attn_input", 0) + ( + ctx.decode_num_tokens * q * d * self.activation_byte_size * L + + 2 * ctx.decode_context_len * kv * d * self.cache_byte_size * L + ) + + read_bytes["out_input"] = T * q * d * self.activation_byte_size * L + read_bytes["out_weight"] = int(q * d * D * self.weight_byte_size * L) + + return read_bytes + + def get_write_bytes_breakdown( + self, ctx: ExecutionContext, per_gpu: bool = True + ) -> dict[str, int]: + """Calculate write memory traffic for attention layers.""" + L, D, q, kv, d = ( + self.num_hidden_layers, + self.hidden_size, + self.num_attention_heads, + self.num_key_value_heads, + self.head_dim, + ) + T = ctx.total_num_tokens() + + if per_gpu: + L //= self.pp_size + # tensor parallel along heads + q = max(1, q // self.tp_size) + kv = max(1, kv // self.tp_size) + + return { + "qkv_output": T * (q + 2 * kv) * d * self.activation_byte_size * L, + "kv_cache": 2 * T * kv * d * self.cache_byte_size * L, + "out_output": T * D * self.activation_byte_size * L, + } + + +#### Ffn #### + + +class BaseFfnConfigParser(Parser): + """ + Parses FFN and MoE configuration. 
+ Provides: intermediate_size, num_experts, num_experts_per_tok, + moe_intermediate_size, num_shared_experts, num_moe_layers + """ + + def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs: + cfg = vllm_config.model_config.hf_config + if hasattr(cfg, "text_config") and cfg.text_config is not None: + cfg = cfg.text_config + + args.intermediate_size = getattr(cfg, "intermediate_size", args.hidden_size * 4) + + # Try different naming conventions. + args.num_experts = vllm_config.model_config.get_num_experts() + args.num_experts_per_tok = getattr_from_list( + cfg, ["num_experts_per_tok", "moe_topk"], 0 + ) + args.moe_intermediate_size = getattr_from_list( + cfg, ["moe_intermediate_size", "intermediate_size"], 0 + ) + args.num_shared_experts = getattr_from_list( + cfg, ["n_shared_experts", "num_shared_experts"], 0 + ) + + is_moe = args.num_experts != 0 + # Assume all MoE layers by default + args.num_moe_layers = args.num_hidden_layers if is_moe else 0 + + return args + + +class FfnParallelParser(Parser): + """ + Parses FFN parallelism configuration. + + Provides: ffn_tp_size, ffn_ep_size + """ + + def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs: + # NOTE: ffn tp_size does not equal the tp_size parameter directly. + # e.g.) If we use DP2TP4, ffn will use TP8 (or EP8 if EP is enabled.) + if args.enable_ep: + ffn_tp_size, ffn_ep_size = 1, args.dp_size * args.tp_size + else: + ffn_tp_size, ffn_ep_size = args.dp_size * args.tp_size, 1 + + args.ffn_tp_size = ffn_tp_size + args.ffn_ep_size = ffn_ep_size + + return args + + +class InterleaveMoeLayerStepParser(Parser): + """ + Parses interleave_moe_layer_step field for models like Llama4. 
+ + Overrides: num_moe_layers + """ + + def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs: + cfg = vllm_config.model_config.hf_config + if hasattr(cfg, "text_config") and cfg.text_config is not None: + cfg = cfg.text_config + + if ( + hasattr(cfg, "interleave_moe_layer_step") + and cfg.interleave_moe_layer_step > 0 + ): + args.num_moe_layers = len( + [ + layer + for layer in range(args.num_hidden_layers) + if (layer + 1) % cfg.interleave_moe_layer_step == 0 + ] + ) + + return args + + +class MoeLayerFreqParser(Parser): + """ + Parses moe_layer_freq and first_k_dense_replace fields for models like Deepseek. + + Overrides: num_moe_layers + """ + + def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs: + cfg = vllm_config.model_config.hf_config + if hasattr(cfg, "text_config") and cfg.text_config is not None: + cfg = cfg.text_config + + if hasattr(cfg, "moe_layer_freq") and hasattr(cfg, "first_k_dense_replace"): + args.num_moe_layers = len( + [ + layer + for layer in range(args.num_hidden_layers) + if layer >= cfg.first_k_dense_replace + and layer % cfg.moe_layer_freq == 0 + ] + ) + + return args + + +class FfnQuantizationConfigParser(Parser): + """ + Parses quantization configuration for FFN layers. + + Overrides: weight_byte_size + """ + + def parse(self, args: ParsedArgs, vllm_config: VllmConfig) -> ParsedArgs: + cfg = vllm_config.quant_config + + if cfg is None: + return args + + quant_method = cfg.get_name() + if quant_method in ["fp8", "fbgemm_fp8"]: + # FIXME: This is a hacky coarse-grained fp8 quantization detection. + # (there might be more quantization methods for fp8). + # FIXME: These configs also have concept of "ignored layers" and we + # need to solve the same problem as above. + args.weight_byte_size = 1 + pass + elif quant_method == "mxfp4": + # FIXME: Also has "ignored layers" issue above + args.weight_byte_size = 0.5 + else: + # FIXME: Add more parsing logic for different quant methods. 
+ raise InvalidComponent + + return args + + +class FfnMetrics(ComponentMetrics): + # From BaseConfigParser + num_hidden_layers: int = Field(..., gt=0) + hidden_size: int = Field(..., gt=0) + activation_byte_size: int = Field(..., gt=0) + pp_size: int = Field(..., gt=0) + + # From FfnParallelParser + ffn_tp_size: int = Field(..., gt=0) + ffn_ep_size: int = Field(..., gt=0) + + # From BaseFfnConfigParser + intermediate_size: int = Field(..., gt=0) + num_experts: int = Field(0) + num_experts_per_tok: int = Field(1) + moe_intermediate_size: int = Field(0) + num_shared_experts: int = Field(0) + + # From BaseConfigParser, can be overridden InterleaveMoeLayerStep or MoeLayerFreq + num_moe_layers: int = Field(..., ge=0) + + # FIXME: might have to make this more granular + # (i.e. dense_weight_byte_size, moe_routed_weight_byte_size, + # moe_shared_weight_byte_size) + # since it can differ from byte size of other components (e.g. attn) + # and can differ even from each other. + + # From BaseConfigParser, can be overridden by FfnQuantizationConfigParser + weight_byte_size: int | float = Field(..., gt=0) + + @model_validator(mode="after") + def validate_moe_fields(self) -> Self: + """Validate that MoE-related fields are properly set when num_moe_layers > 0.""" + if self.num_moe_layers > 0: + assert self.num_experts, f"{self.num_experts=}" + assert self.num_experts_per_tok, f"{self.num_experts_per_tok=}" + assert self.moe_intermediate_size, f"{self.moe_intermediate_size=}" + return self + + @classmethod + def component_type(cls) -> str: + return "ffn" + + @classmethod + def get_parser(cls) -> ParserChain: + return ParserChain( + BaseConfigParser(), + FfnParallelParser(), + BaseFfnConfigParser(), + InterleaveMoeLayerStepParser(), + MoeLayerFreqParser(), + FfnQuantizationConfigParser(), + ) + + def get_num_flops_breakdown( + self, ctx: ExecutionContext, per_gpu: bool = True + ) -> dict[str, int]: + """Calculate flops breakdown for FFN layers.""" + L, D, DI = 
self.num_hidden_layers, self.hidden_size, self.intermediate_size + Lm, E, MI, S = ( + self.num_moe_layers, + self.num_experts_per_tok, + self.moe_intermediate_size, + self.num_shared_experts, + ) + T = ctx.total_num_tokens() + + Ld = L - Lm + + num_activated_tokens = T * E if E else 0 + + if per_gpu: + Ld //= self.pp_size + Lm //= self.pp_size + + DI //= self.ffn_tp_size + if MI is not None: + MI //= self.ffn_tp_size + if E: + num_activated_tokens //= self.ffn_ep_size + + flops = {} + + # Dense FFN layers (SwiGLU: 3 linear layers: up, gate, down) + if Ld: + flops["dense_ffn"] = 2 * D * 3 * DI * T * Ld + + # MoE routed experts (each token activates E experts) + if Lm and E: + flops["routed_ffn"] = 2 * D * 3 * MI * num_activated_tokens * Lm + + # MoE shared experts (all S shared experts run for every token) + if Lm and S: + flops["shared_ffn"] = 2 * D * 3 * MI * S * T * Lm + + return flops + + def get_read_bytes_breakdown( + self, ctx: ExecutionContext, per_gpu: bool = True + ) -> dict[str, int]: + """Calculate read memory traffic for FFN layers.""" + L, D, DI = self.num_hidden_layers, self.hidden_size, self.intermediate_size + Lm, E, MI, S = ( + self.num_moe_layers, + self.num_experts_per_tok, + self.moe_intermediate_size, + self.num_shared_experts, + ) + T = ctx.total_num_tokens() + num_experts = self.num_experts + + Ld = L - Lm + + num_activated_tokens = T * E if E else 0 + + if per_gpu: + Ld //= self.pp_size + Lm //= self.pp_size + + DI //= self.ffn_tp_size + if MI is not None: + MI //= self.ffn_tp_size + if E: + num_activated_tokens //= self.ffn_ep_size + if num_experts is not None: + num_experts //= self.ffn_ep_size + + read_bytes = {} + + # Dense FFN layers (3 GEMMs: up, gate, down projections + SiLU activation) + if Ld: + read_bytes["dense_up_gate_input"] = int( + T * D * self.activation_byte_size * Ld + ) + read_bytes["dense_up_gate_weights"] = int( + 2 * D * DI * self.weight_byte_size * Ld + ) + read_bytes["dense_silu_input"] = int( + 2 * T * DI * 
self.activation_byte_size * Ld + ) + read_bytes["dense_down_input"] = int( + T * DI * self.activation_byte_size * Ld + ) + read_bytes["dense_down_weights"] = int(D * DI * self.weight_byte_size * Ld) + + if Lm: + # MoE routed expert reads + if E: + # FIXME: Assume perfect load balancing for now. + num_activated_experts = min(num_activated_tokens, num_experts) + + read_bytes["routed_up_gate_input"] = int( + num_activated_tokens * D * self.activation_byte_size * Lm + ) + read_bytes["routed_up_gate_weights"] = int( + 2 * D * MI * num_activated_experts * self.weight_byte_size * Lm + ) + read_bytes["routed_silu_input"] = int( + 2 * num_activated_tokens * MI * self.activation_byte_size * Lm + ) + read_bytes["routed_down_input"] = int( + num_activated_tokens * MI * self.activation_byte_size * Lm + ) + read_bytes["routed_down_weights"] = int( + D * MI * num_activated_experts * self.weight_byte_size * Lm + ) + + # MoE shared expert reads + if S: + read_bytes["shared_up_gate_input"] = int( + T * D * self.activation_byte_size * Lm + ) + read_bytes["shared_up_gate_weights"] = int( + 2 * D * MI * S * self.weight_byte_size * Lm + ) + read_bytes["shared_silu_input"] = int( + 2 * T * MI * S * self.activation_byte_size * Lm + ) + read_bytes["shared_down_input"] = int( + T * MI * self.activation_byte_size * Lm + ) + read_bytes["shared_down_weights"] = int( + D * MI * S * self.weight_byte_size * Lm + ) + + return read_bytes + + def get_write_bytes_breakdown( + self, ctx: ExecutionContext, per_gpu: bool = True + ) -> dict[str, int]: + """Calculate write memory traffic for FFN layers.""" + L, D, DI = self.num_hidden_layers, self.hidden_size, self.intermediate_size + Lm, E, MI, S = ( + self.num_moe_layers, + self.num_experts_per_tok, + self.moe_intermediate_size, + self.num_shared_experts, + ) + T = ctx.total_num_tokens() + + Ld = L - Lm + + num_activated_tokens = T * E if E else 0 + + if per_gpu: + Ld //= self.pp_size + Lm //= self.pp_size + + DI //= self.ffn_tp_size + if MI is not 
None: + MI //= self.ffn_tp_size + if E: + num_activated_tokens //= self.ffn_ep_size + + write_bytes = {} + + # Dense FFN layers + if Ld: + write_bytes["dense_up_gate_output"] = int( + 2 * T * DI * self.activation_byte_size * Ld + ) + write_bytes["dense_silu_output"] = int( + T * DI * self.activation_byte_size * Ld + ) + write_bytes["dense_down_output"] = int( + T * D * self.activation_byte_size * Ld + ) + + # MoE outputs + if Lm: + if E: + write_bytes["routed_up_gate_output"] = int( + 2 * num_activated_tokens * MI * self.activation_byte_size * Lm + ) + write_bytes["routed_silu_output"] = int( + num_activated_tokens * MI * self.activation_byte_size * Lm + ) + write_bytes["routed_down_output"] = int( + num_activated_tokens * D * self.activation_byte_size * Lm + ) + if S: + write_bytes["shared_up_gate_output"] = int( + 2 * T * S * MI * self.activation_byte_size * Lm + ) + write_bytes["shared_silu_output"] = int( + T * S * MI * self.activation_byte_size * Lm + ) + write_bytes["shared_down_output"] = int( + T * S * D * self.activation_byte_size * Lm + ) + + return write_bytes + + +#### Unembed #### + + +class UnembedMetrics(ComponentMetrics): + # From BaseConfigParser + hidden_size: int = Field(..., gt=0) + vocab_size: int = Field(..., gt=0) + weight_byte_size: int = Field(..., gt=0) + activation_byte_size: int = Field(..., gt=0) + + tp_size: int + + @classmethod + def component_type(cls) -> str: + return "unembed" + + @classmethod + def get_parser(cls) -> ParserChain: + return ParserChain( + BaseConfigParser(), + ) + + def get_num_flops_breakdown( + self, ctx: ExecutionContext, per_gpu: bool = True + ) -> dict[str, int]: + """Calculate flops breakdown for unembedding layer.""" + D, V = self.hidden_size, self.vocab_size + T = ctx.total_num_tokens() + + if per_gpu: + V //= self.tp_size + + return { + "unembed": 2 * T * D * V, + } + + def get_read_bytes_breakdown( + self, ctx: ExecutionContext, per_gpu: bool = True + ) -> dict[str, int]: + """Calculate read memory traffic 
for unembedding layer.""" + D, V = self.hidden_size, self.vocab_size + T = ctx.total_num_tokens() + + if per_gpu: + V //= self.tp_size + + return { + "input": T * D * self.activation_byte_size, + "weight": D * V * self.weight_byte_size, + } + + def get_write_bytes_breakdown( + self, ctx: ExecutionContext, per_gpu: bool = True + ) -> dict[str, int]: + """Calculate write memory traffic for unembedding layer.""" + V = self.vocab_size + T = ctx.total_num_tokens() + + if per_gpu: + V //= self.tp_size + + return { + "output": T * V * self.activation_byte_size, + } + + +#### ModelMetrics #### + + +class ModelMetrics: + def __init__(self, vllm_config: VllmConfig) -> None: + """ + Parse vllm_config to instantiate metrics for each component. + is_enabled() will return False if no component metrics could be instantiated. + """ + + self.vllm_config = vllm_config + + self.metrics: list[ComponentMetrics] = [] + for metric_cls in ComponentMetrics.registered_metrics(): + try: + metric = metric_cls.from_vllm_config(vllm_config) + self.metrics.append(metric) + logger.info( + "Instantiated ComponentMetrics [%s] with (%s)", + metric.component_type(), + str(metric), + ) + except InvalidComponent as e: + logger.debug( + "Failed to instantiate %s from %s", + metric_cls.component_type(), + str(e), + ) + + def is_enabled(self) -> bool: + return len(self.metrics) > 0 + + def get_num_flops(self, ctx: ExecutionContext, per_gpu: bool = True) -> int: + return sum(metric.get_num_flops(ctx, per_gpu) for metric in self.metrics) + + def get_read_bytes(self, ctx: ExecutionContext, per_gpu: bool = True) -> int: + return sum(metric.get_read_bytes(ctx, per_gpu) for metric in self.metrics) + + def get_write_bytes(self, ctx: ExecutionContext, per_gpu: bool = True) -> int: + return sum(metric.get_write_bytes(ctx, per_gpu) for metric in self.metrics) + + def get_num_flops_breakdown( + self, ctx: ExecutionContext, per_gpu: bool = True + ) -> dict[str, int]: + total = {} + for metric in self.metrics: + 
breakdown = metric.get_num_flops_breakdown(ctx, per_gpu) + component = metric.component_type() + prefixed = {f"{component}.{key}": val for key, val in breakdown.items()} + total.update(prefixed) + return total + + def get_read_bytes_breakdown( + self, ctx: ExecutionContext, per_gpu: bool = True + ) -> dict[str, int]: + total = {} + for metric in self.metrics: + breakdown = metric.get_read_bytes_breakdown(ctx, per_gpu) + component = metric.component_type() + prefixed = {f"{component}.{key}": val for key, val in breakdown.items()} + total.update(prefixed) + return total + + def get_write_bytes_breakdown( + self, ctx: ExecutionContext, per_gpu: bool = True + ) -> dict[str, int]: + total = {} + for metric in self.metrics: + breakdown = metric.get_write_bytes_breakdown(ctx, per_gpu) + component = metric.component_type() + prefixed = {f"{component}.{key}": val for key, val in breakdown.items()} + total.update(prefixed) + return total + + def get_step_perf_stats_per_gpu( + self, scheduler_output: SchedulerOutput + ) -> PerfStats: + """ + Calculate perf stats for the current step based on scheduled tokens. 
+ """ + + t0 = time.monotonic() + + # Build a single batch context + ctx = ExecutionContext() + + # Process new requests (these are in prefill phase) + for new_req in scheduler_output.scheduled_new_reqs: + req_id = new_req.req_id + num_tokens = scheduler_output.num_scheduled_tokens.get(req_id, 0) + if num_tokens == 0: + continue + + # For new requests, context_len = num_computed_tokens + num_tokens + # num_computed_tokens represents previously computed tokens in the sequence + context_len = new_req.num_computed_tokens + num_tokens + ctx.add(num_tokens, context_len, is_prefill=True) + + # Process cached requests (continuing requests) + cached_reqs = scheduler_output.scheduled_cached_reqs + for i, req_id in enumerate(cached_reqs.req_ids): + num_tokens = scheduler_output.num_scheduled_tokens.get(req_id, 0) + if num_tokens == 0: + continue + + # For cached requests, we have the current num_computed_tokens + num_computed_tokens = cached_reqs.num_computed_tokens[i] + context_len = num_computed_tokens + num_tokens + + # Cached requests are typically in decode phase (num_tokens == 1) + # unless they're doing chunked prefill (num_tokens > 1) + is_prefill = num_tokens > 1 + ctx.add(num_tokens, context_len, is_prefill) + + num_flops_breakdown = self.get_num_flops_breakdown(ctx, True) + read_bytes_breakdown = self.get_read_bytes_breakdown(ctx, True) + write_bytes_breakdown = self.get_write_bytes_breakdown(ctx, True) + perf_stats = PerfStats( + sum(num_flops_breakdown.values()), + sum(read_bytes_breakdown.values()), + sum(write_bytes_breakdown.values()), + ) + + if envs.VLLM_DEBUG_MFU_METRICS: + perf_stats.debug_stats = DebugPerfStats( + time.monotonic() - t0, + ctx.num_prefill_requests, + ctx.num_decode_requests, + asdict(ctx), + num_flops_breakdown, + read_bytes_breakdown, + write_bytes_breakdown, + ) + + return perf_stats + + +#### Logging #### + + +class PerfMetricsDebugLogging: + def __init__(self): + self.reset() + + def reset(self): + self.total_calc_duration: float = 
0.0 + self.total_num_prefill_requests: int = 0 + self.total_num_decode_requests: int = 0 + self.total_num_batches: int = 0 + self.total_context_breakdown: dict[str, int] = {} + self.total_num_flops_per_gpu_breakdown: dict[str, int] = {} + self.total_read_bytes_per_gpu_breakdown: dict[str, int] = {} + self.total_write_bytes_per_gpu_breakdown: dict[str, int] = {} + + def observe(self, debug_stats: DebugPerfStats) -> None: + self.total_calc_duration += debug_stats.calc_duration + self.total_num_prefill_requests += debug_stats.num_prefill_requests + self.total_num_decode_requests += debug_stats.num_decode_requests + self.total_num_batches += 1 + + for dst, src in zip( + [ + self.total_context_breakdown, + self.total_num_flops_per_gpu_breakdown, + self.total_read_bytes_per_gpu_breakdown, + self.total_write_bytes_per_gpu_breakdown, + ], + [ + debug_stats.context_breakdown, + debug_stats.num_flops_per_gpu_breakdown, + debug_stats.num_read_bytes_per_gpu_breakdown, + debug_stats.num_write_bytes_per_gpu_breakdown, + ], + ): + assert isinstance(src, dict) + for key, val in src.items(): + dst[key] = dst.get(key, 0) + val + + def log(self, log_fn, log_prefix: str, delta_time: float): + # pretty print breakdowns + total_num_flops_per_gpu_breakdown = { + k: f"{v / 1e12:.1f}TF" + for k, v in self.total_num_flops_per_gpu_breakdown.items() + } + total_read_bytes_per_gpu_breakdown = { + k: f"{v / 1e9:.1f}GB" + for k, v in self.total_read_bytes_per_gpu_breakdown.items() + } + total_write_bytes_per_gpu_breakdown = { + k: f"{v / 1e9:.1f}GB" + for k, v in self.total_write_bytes_per_gpu_breakdown.items() + } + + logger.debug( + "%sMFU details: %s", + log_prefix, + json.dumps( + { + "prefill_reqs": self.total_num_prefill_requests, + "decode_reqs": self.total_num_decode_requests, + "num_batches": self.total_num_batches, + "context_breakdown": self.total_context_breakdown, + "flops_breakdown": total_num_flops_per_gpu_breakdown, + "num_read_bytes_breakdown": 
total_read_bytes_per_gpu_breakdown, + "num_write_bytes_breakdown": (total_write_bytes_per_gpu_breakdown), + "duration": f"{delta_time:.1f}s", + "mfu_calc_overhead": ( + f"{self.total_calc_duration / delta_time:.1%}" + ), + }, + indent=2, + ), + ) + + +class PerfMetricsLogging: + def __init__(self, vllm_config: VllmConfig): + self.vllm_config = vllm_config + self.pp_size = vllm_config.parallel_config.pipeline_parallel_size + + self.debug_logging: PerfMetricsDebugLogging | None = None + if envs.VLLM_DEBUG_MFU_METRICS: + self.debug_logging = PerfMetricsDebugLogging() + + self.reset() + + def reset(self): + self.last_log_time = time.monotonic() + + self.total_num_flops_per_gpu: int = 0 + self.total_read_bytes_per_gpu: int = 0 + self.total_write_bytes_per_gpu: int = 0 + + if self.debug_logging: + self.debug_logging.reset() + + def observe(self, perf_stats: PerfStats) -> None: + self.total_num_flops_per_gpu += perf_stats.num_flops_per_gpu + self.total_read_bytes_per_gpu += perf_stats.num_read_bytes_per_gpu + self.total_write_bytes_per_gpu += perf_stats.num_write_bytes_per_gpu + + if self.debug_logging: + assert perf_stats.debug_stats is not None + self.debug_logging.observe(perf_stats.debug_stats) + + def log(self, log_fn=logger.info, log_prefix: str = "") -> None: + if not ( + self.total_num_flops_per_gpu + or self.total_read_bytes_per_gpu + or self.total_write_bytes_per_gpu + ): + return + + now = time.monotonic() + delta_time = now - self.last_log_time + + if delta_time <= 0.0: + avg_tflops_per_gpu = 0.0 + avg_gbps_per_gpu = 0.0 + else: + avg_tflops_per_gpu = self.total_num_flops_per_gpu / delta_time / 1e12 + avg_gbps_per_gpu = ( + (self.total_read_bytes_per_gpu + self.total_write_bytes_per_gpu) + / delta_time + / 1e9 + ) + + log_fn( + "%sMFU: %.1f TF/s/GPU %.1f GB/s/GPU", + log_prefix, + avg_tflops_per_gpu, + avg_gbps_per_gpu, + ) + + if self.debug_logging: + self.debug_logging.log(log_fn, log_prefix, delta_time) + + self.reset() + + +## util functions + + +def 
get_required(obj: object, attr: str): + """Get an attr from an object, or throw a InvalidComponentError if it's not set.""" + if not hasattr(obj, attr): + raise InvalidComponent(f"Missing required attr {attr} in config") + return getattr(obj, attr) + + +def getattr_from_list(obj: object, attrs: list[str], default: object = None): + """Try to get the first attr that exists in the object + from a list of attrs. Otherwise return None.""" + for attr in attrs: + if hasattr(obj, attr): + return getattr(obj, attr) + return default diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index a0cc58d0a64e8..cb1a860e38fbc 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Any import vllm.envs as envs from vllm.compilation.cuda_graph import CUDAGraphStat +from vllm.v1.metrics.perf import PerfStats from vllm.v1.spec_decode.metrics import SpecDecodingStats if TYPE_CHECKING: @@ -186,6 +187,8 @@ class SchedulerStats: cudagraph_stats: CUDAGraphStat | None = None + perf_stats: PerfStats | None = None + @dataclass class RequestStateStats: From fd8afdf38dad8bf7ccc4e7fcc3d4aaa4d6d9e0d8 Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Wed, 17 Dec 2025 20:27:37 -0600 Subject: [PATCH 038/176] [ROCm][CI] Reduce Flakiness For test_async_scheduling Using ROCM_ATTN With FP32 (#30811) Signed-off-by: Micah Williamson --- tests/v1/e2e/test_async_scheduling.py | 12 ++---------- vllm/v1/attention/backends/rocm_attn.py | 6 +++++- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index 61e56c079a3b5..6447a33838d75 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -148,7 +148,7 @@ def run_tests( # Use TRITON_ATTN for spec decoding test for consistency attention_config = {"backend": "TRITON_ATTN"} else: - attention_config = {"backend": "ROCM_AITER_FA"} + attention_config = 
{"backend": "ROCM_ATTN"} else: attention_config = {"backend": "FLEX_ATTENTION"} @@ -284,14 +284,6 @@ def run_test( print(f"---- TESTING {test_str}: {test_config}") print("-" * 80) - # On ROCm: use float16 for first test (ROCM_AITER_FA), but float32 for - # spec decoding test (TRITON_ATTN) for better precision. - # On others: always use float32. - if current_platform.is_rocm() and not is_testing_with_spec_decoding: - dtype = "float16" - else: - dtype = "float32" - with VllmRunner( model, max_model_len=512, @@ -301,7 +293,7 @@ def run_test( # enforce_eager=True, async_scheduling=async_scheduling, distributed_executor_backend=executor, - dtype=dtype, + dtype="float32", speculative_config=spec_config, disable_log_stats=False, attention_config=attention_config, diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py index e231c600cba7a..3701373f33315 100644 --- a/vllm/v1/attention/backends/rocm_attn.py +++ b/vllm/v1/attention/backends/rocm_attn.py @@ -152,7 +152,11 @@ class RocmAttentionMetadataBuilder(AttentionMetadataBuilder[RocmAttentionMetadat class RocmAttentionBackend(AttentionBackend): accept_output_buffer: bool = True - supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] + supported_dtypes: ClassVar[list[torch.dtype]] = [ + torch.float16, + torch.bfloat16, + torch.float32, + ] @classmethod def get_supported_head_sizes(cls) -> list[int]: From bc3700e0cd8875951b87b17accd7aa4d80ddca50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 18 Dec 2025 04:53:30 +0100 Subject: [PATCH 039/176] [NIXL] Support P tensor-parallel-size > D tensor-parallel-size (#27274) Signed-off-by: NickLucche --- .../tp_config_sweep_accuracy_test.sh | 3 + .../kv_connector/unit/test_nixl_connector.py | 245 +++++++++- .../kv_transfer/kv_connector/utils.py | 62 ++- .../kv_connector/v1/nixl_connector.py | 458 +++++++++++------- 4 files changed, 556 insertions(+), 212 deletions(-) diff --git 
a/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh index 8199fd516cd43..f6b4498ceb371 100755 --- a/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh @@ -8,9 +8,12 @@ SCRIPT="v1/kv_connector/nixl_integration/run_accuracy_test.sh" configs=( "GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2" "GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2" + "GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1" "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA case "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" + "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" "DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP1, D-DPEP=2 (TP=1) + "DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP2, D-DPEP=2 (TP=1) ) run_tests() { diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 25f4308079595..20ef566416b8f 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -391,6 +391,8 @@ class FakeNixlConnectorWorker(NixlConnectorWorker): super().__init__(*args, **kwargs) self._hand_shake_latency = hand_shake_latency self.kv_cache_layout = kv_cache_layout + # Mock register_kv_caches attribute needed for tests that do not call it. 
+ self.src_xfer_handles_by_block_size = {self.block_size: 1} def _nixl_handshake( self, host: str, port: int, remote_tp_size: int, expected_engine_id: str @@ -407,22 +409,43 @@ class FakeNixlConnectorWorker(NixlConnectorWorker): assert expected_engine_id == self.REMOTE_ENGINE_ID - remote_agent_name = self.add_remote_agent( - NixlAgentMetadata( - engine_id=self.REMOTE_ENGINE_ID, - agent_metadata=FakeNixlWrapper.AGENT_METADATA, - kv_caches_base_addr=[0], - device_id=0, - num_blocks=1, - block_lens=self.block_len_per_layer, - # `self.kv_cache_layout` is only forced to HND when vllm engine - # is started. We mock HND here. - kv_cache_layout="HND", - block_size=self.block_size, - ), - remote_tp_size=remote_tp_size, - ) - return {0: remote_agent_name} + # Adjust remote block length metadata to satisfy heterogeneous TP + # invariants enforced during handshake validation. + remote_block_lens = list(self.block_len_per_layer) + tp_ratio = self.kv_topo.tp_ratio(remote_tp_size) + if remote_tp_size > self.world_size: + # P TP > D TP case, block_len of remote is smaller + remote_block_lens = [ + block_len // (-tp_ratio) for block_len in remote_block_lens + ] + elif remote_tp_size < self.world_size: + remote_block_lens = [ + block_len * tp_ratio for block_len in remote_block_lens + ] + + # When remote tp_size > local tp_size, handshake with multiple + # remote ranks. + num_hanshakes = 1 if tp_ratio > 0 else -tp_ratio + remote_agents: dict[int, str] = {} + for remote_tp_rank in range(num_hanshakes): + remote_agent_name = self.add_remote_agent( + NixlAgentMetadata( + engine_id=self.REMOTE_ENGINE_ID, + agent_metadata=FakeNixlWrapper.AGENT_METADATA, + kv_caches_base_addr=[0], + device_id=remote_tp_rank, + num_blocks=1, + block_lens=remote_block_lens, + # `self.kv_cache_layout` is only forced to HND when vllm engine + # is started. We mock HND here. 
+ kv_cache_layout="HND", + block_size=self.block_size, + ), + remote_tp_rank=remote_tp_rank, + remote_tp_size=remote_tp_size, + ) + remote_agents[remote_tp_rank] = remote_agent_name + return remote_agents class TestNixlHandshake: @@ -453,7 +476,13 @@ class TestNixlHandshake: vllm_config, connector.engine_id, hand_shake_latency=0 ) assert isinstance(connector.connector_worker.nixl_wrapper, FakeNixlWrapper) - connector.connector_worker.nixl_wrapper.set_cycles_before_xfer_done(3) + worker = connector.connector_worker + worker.nixl_wrapper.set_cycles_before_xfer_done(3) + # simulate handshake + worker.dst_xfer_side_handles = { + FakeNixlConnectorWorker.REMOTE_ENGINE_ID: {0: 1} + } + worker.kv_cache_layout = "HND" num_xfers = 4 while True: # For the same request_id, initiate multiple xfers across different @@ -567,6 +596,171 @@ class TestNixlHandshake: return raise TimeoutError("Took too long to complete async handshake.") + @patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", + FakeNixlWrapper, + ) + @pytest.mark.parametrize("local_tp_size", [1, 2]) + def test_prefill_tp_size_greater_than_decode_tp_size( + self, local_tp_size: int, dist_init + ): + """ + Verify remote TP > local TP handshake succeeds with different + remote configurations. 
+ """ + + vllm_config = create_vllm_config() + local_tp_size = 1 + vllm_config.parallel_config.tensor_parallel_size = local_tp_size + + connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector.connector_worker = FakeNixlConnectorWorker( + vllm_config, connector.engine_id, hand_shake_latency=0 + ) + worker = connector.connector_worker + + # Minimal local registration params used by add_remote_agent + worker.slot_size_per_layer = [4096] + worker.block_len_per_layer = [4096 * worker.block_size] + worker.num_blocks = 1 + worker.dst_num_blocks[worker.engine_id] = worker.num_blocks + worker.src_blocks_data = [(0, worker.block_len_per_layer[0], worker.tp_rank)] + + def check_handshake(remote_tp_size: int): + tp_ratio = remote_tp_size // local_tp_size + assert set(remote_agents.keys()) == set(range(tp_ratio)) + + remote_engine_id = worker.REMOTE_ENGINE_ID + assert worker._tp_size[remote_engine_id] == remote_tp_size + assert -tp_ratio == worker.kv_topo.tp_ratio_from_engine_id(remote_engine_id) + # ensure src_xfer_handles_by_tp_ratio is populated with tpratio chunks + assert -tp_ratio in worker.src_xfer_handles_by_tp_ratio + assert len(worker.src_xfer_handles_by_tp_ratio[-tp_ratio]) == tp_ratio + assert remote_engine_id in worker.dst_xfer_side_handles + assert set(worker.dst_xfer_side_handles[remote_engine_id].keys()) == set( + range(tp_ratio) + ) + + remote_agents = worker._nixl_handshake( + host="localhost", + port=1234, + remote_tp_size=2, + expected_engine_id=worker.REMOTE_ENGINE_ID, + ) + check_handshake(2) + + # NOTE flexiblity: a second remote with higher number of ranks is + # discovered. This is not a scenario we actively support right now, but + # the connector allows it. 
+ worker.REMOTE_ENGINE_ID = "remote_engine_2" + remote_agents = worker._nixl_handshake( + host="localhost", + port=1234, + remote_tp_size=6, + expected_engine_id=worker.REMOTE_ENGINE_ID, + ) + check_handshake(6) + + @patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", + FakeNixlWrapper, + ) + @pytest.mark.parametrize("local_tp_size", [1, 2]) + def test_prefill_tp_size_greater_than_decode_tp_size_mla( + self, local_tp_size: int, dist_init + ): + """ + Verify remote TP > local TP handshake succeeds with different + remote configurations for an MLA model. + """ + vllm_config = create_vllm_config() + d_tp_size = 1 + p_tp_size = 2 + + # Build two separate connectors/workers to emulate P TP=2 ranks. + conn_p0 = NixlConnector(vllm_config, KVConnectorRole.WORKER) + conn_p1 = NixlConnector(vllm_config, KVConnectorRole.WORKER) + conn_p0.connector_worker = FakeNixlConnectorWorker( + vllm_config, conn_p0.engine_id, hand_shake_latency=0 + ) + conn_p1.connector_worker = FakeNixlConnectorWorker( + vllm_config, conn_p1.engine_id, hand_shake_latency=0 + ) + + # Force P world size to 2 for both workers and emulate distinct tp_ranks. + # Also enable MLA path so that expected_finished_count is updated. + for rank, worker in enumerate( + (conn_p0.connector_worker, conn_p1.connector_worker) + ): + worker.world_size = p_tp_size + worker.kv_topo.remote_tp_size = {worker.engine_id: p_tp_size} + worker.tp_rank = rank + worker.use_mla = True + + req_id = "req-ep-dp2-p0" + now = time.perf_counter() + # Register a request on P that is waiting for consumers to read + # (both workers track it). + conn_p0.connector_worker._reqs_to_send[req_id] = now + 10.0 + conn_p0.connector_worker._reqs_to_process.add(req_id) + conn_p1.connector_worker._reqs_to_send[req_id] = now + 10.0 + conn_p1.connector_worker._reqs_to_process.add(req_id) + + # Simulate a read notification coming from D with (tp=1, dp=2). 
+ notif = f"{req_id}:{d_tp_size}".encode() + # D0-0->P0 notif + conn_p0.connector_worker.nixl_wrapper.get_new_notifs = lambda: { + "agent": [notif] + } # type: ignore[method-assign] + conn_p1.connector_worker.nixl_wrapper.get_new_notifs = lambda: { + "agent": [notif] + } # type: ignore[method-assign] + + # Trigger notification processing via get_finished(). + done_sending0, _ = conn_p0.get_finished(finished_req_ids=set()) + done_sending1, _ = conn_p1.get_finished(finished_req_ids=set()) + assert req_id in done_sending0 and req_id in done_sending1 + + # E2E aggregation: ensure the aggregated output marks the request + # as finished using the connector's expected_finished_count. + from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput + + aggregator = KVOutputAggregator.from_connector(conn_p0, world_size=2) + + out0 = ModelRunnerOutput( + req_ids=[req_id], + req_id_to_index={req_id: 0}, + sampled_token_ids=[[0]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[None], + kv_connector_output=KVConnectorOutput( + finished_sending=done_sending0, + finished_recving=None, + ), + ) + out1 = ModelRunnerOutput( + req_ids=[req_id], + req_id_to_index={req_id: 0}, + sampled_token_ids=[[0]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[None], + kv_connector_output=KVConnectorOutput( + finished_sending=done_sending1, + finished_recving=None, + ), + ) + aggregated = aggregator.aggregate([out0, out1], output_rank=0) + assert aggregated.kv_connector_output is not None + assert aggregated.kv_connector_output.finished_sending == {req_id} + + # Producers cleaned up state for the finished request. 
+ assert req_id not in conn_p0.connector_worker._reqs_to_send + assert req_id not in conn_p0.connector_worker._reqs_to_process + assert req_id not in conn_p1.connector_worker._reqs_to_send + assert req_id not in conn_p1.connector_worker._reqs_to_process + @patch( "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", FakeNixlWrapper, @@ -585,6 +779,9 @@ class TestNixlHandshake: connector.connector_worker = FakeNixlConnectorWorker( vllm_config, connector.engine_id ) + # Register (mocked) local xfer handler + # worker = connector.connector_worker + # worker.src_xfer_handles_by_block_size = {worker.block_size: 1} metadata = NixlConnectorMetadata() total_reqs = 5 for i in range(total_reqs): @@ -672,7 +869,6 @@ class TestNixlHandshake: with pytest.raises(RuntimeError): # mismatched layout is expected to fail worker.add_remote_agent(meta, remote_tp_size=2) - with pytest.raises(AssertionError): worker.add_remote_agent(meta, remote_tp_size=1) @patch( @@ -1357,8 +1553,11 @@ def test_shutdown_cleans_up_resources(dist_init): patch.object(nixl_wrapper, "deregister_memory") as mock_dereg, ): worker._recving_transfers = {"req1": [123]} - worker.src_xfer_side_handle = 456 - worker.dst_xfer_side_handles = {"engine1": 789} + # Mock register_kv_cache which registers local handle + worker.src_xfer_handles_by_block_size = {worker.block_size: 455} + # P TP = 2 * D TP case, we should register 2 local handles + worker.src_xfer_handles_by_tp_ratio = {-2: [456, 457]} + worker.dst_xfer_side_handles = {"engine1": {0: 789}} worker._remote_agents = {"engine1": {0: "agent1"}} worker._registered_descs = ["desc1", "desc2"] @@ -1379,8 +1578,10 @@ def test_shutdown_cleans_up_resources(dist_init): mock_listener.join.assert_called_once() mock_rel_xfer.assert_called_once_with(123) - assert mock_rel_dlist.call_count == 2 - mock_rel_dlist.assert_any_call(456) # src handle + assert mock_rel_dlist.call_count == 4 + mock_rel_dlist.assert_any_call(455) # src handle (whole region) + 
mock_rel_dlist.assert_any_call(456) # src handle (1st chunk) + mock_rel_dlist.assert_any_call(457) # src handle (2nd chunk) mock_rel_dlist.assert_any_call(789) # dst handle mock_rem_agent.assert_called_once_with("agent1") assert mock_dereg.call_count == 2 diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 117d159e25e71..a026cccb85372 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -21,6 +21,8 @@ if TYPE_CHECKING: logger = init_logger(__name__) +EngineId = str + def get_kv_connector_cache_layout(): # NOTE (NickLucche) When running disaggregated PD with NIXL, HND layout is @@ -209,12 +211,12 @@ class TpKVTopology: """ tp_rank: int - remote_tp_size: dict[str, int] + remote_tp_size: dict[EngineId, int] is_mla: bool total_num_kv_heads: int attn_backend: type[AttentionBackend] - engine_id: str - remote_block_size: dict[str, int] + engine_id: EngineId + remote_block_size: dict[EngineId, int] def __post_init__(self): # Figure out whether the first dimension of the cache is K/V @@ -256,18 +258,28 @@ class TpKVTopology: Calculate the tensor parallel ratio between local and remote TP. We can think of it as the number of local TP workers-per-remote TP workers. Local workers will read from the same remote TP worker in - groups of size `tp_ratio`. + groups of size `tp_ratio`.If remote tp_size > local tp_size, the + ratio is flipped (remote_size/local_size) and the returned value is + negative. """ - assert self.tp_size % remote_tp_size == 0, ( - f"Local tensor parallel size {self.tp_size} is not divisible " - f"by remote tensor parallel size {remote_tp_size}." + if self.tp_size >= remote_tp_size: + assert self.tp_size % remote_tp_size == 0, ( + f"Local tensor parallel size {self.tp_size} is not divisible " + f"by remote tensor parallel size {remote_tp_size}." 
+ ) + return self.tp_size // remote_tp_size + + assert remote_tp_size % self.tp_size == 0, ( + f"Remote tensor parallel size {remote_tp_size} is not divisible " + f"by local tensor parallel size {self.tp_size}." ) - return self.tp_size // remote_tp_size + # P TP > D TP case, return the ratio as negative + return -remote_tp_size // self.tp_size def block_size_ratio( self, remote_block_size: int, - ) -> float: + ) -> int: """ Calculate the block size ratio between local and remote TP. """ @@ -279,19 +291,19 @@ class TpKVTopology: def tp_ratio_from_engine_id( self, - remote_engine_id: str, + remote_engine_id: EngineId, ) -> int: remote_tp_size = self.remote_tp_size[remote_engine_id] return self.tp_ratio(remote_tp_size) def block_size_ratio_from_engine_id( self, - remote_engine_id: str, - ) -> float: + remote_engine_id: EngineId, + ) -> int: remote_block_size = self.remote_block_size[remote_engine_id] return self.block_size_ratio(remote_block_size) - def is_kv_replicated(self, engine_id: str) -> bool: + def is_kv_replicated(self, engine_id: EngineId) -> bool: """ Whether the KV cache is replicated across TP workers due to the number of TP workers being greater than the number of KV heads. @@ -299,24 +311,30 @@ class TpKVTopology: tp_size = self.remote_tp_size[engine_id] return tp_size // self.total_num_kv_heads >= 1 - def replicates_kv_cache(self, remote_engine_id: str) -> bool: + def replicates_kv_cache(self, remote_engine_id: EngineId) -> bool: # MLA is always replicated as the hidden dim can't be split. return self.is_mla or self.is_kv_replicated(remote_engine_id) - def get_target_remote_rank( + def get_target_remote_ranks( self, remote_tp_size: int, - ) -> int: + ) -> list[int]: """ Get the remote TP rank (on P) that the current local TP rank - (on D) will read from. + (on D) will read from. When remote tp_size > local tp_size, we + read from multiple remote ranks. 
""" tp_ratio = self.tp_ratio(remote_tp_size) - return self.tp_rank // tp_ratio + if tp_ratio > 0: + return [self.tp_rank // tp_ratio] - def get_target_remote_rank_from_engine_id( + # P TP > D TP case, D reads from |tp_ratio| remote workers. + tp_ratio = -tp_ratio + return [self.tp_rank * tp_ratio + i for i in range(tp_ratio)] + + def get_target_remote_ranks_from_engine_id( self, - remote_engine_id: str, - ) -> int: + remote_engine_id: EngineId, + ) -> list[int]: remote_tp_size = self.remote_tp_size[remote_engine_id] - return self.get_target_remote_rank(remote_tp_size) + return self.get_target_remote_ranks(remote_tp_size) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index fb4b8ac391afb..be56eb4e93c10 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -23,7 +23,7 @@ from vllm import envs from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.selector import get_attn_backend from vllm.config import VllmConfig -from vllm.distributed.kv_transfer.kv_connector.utils import TpKVTopology +from vllm.distributed.kv_transfer.kv_connector.utils import EngineId, TpKVTopology from vllm.distributed.kv_transfer.kv_connector.v1.base import ( CopyBlocksOp, KVConnectorBase_V1, @@ -56,7 +56,6 @@ if TYPE_CHECKING: from vllm.v1.request import Request TransferHandle = int -EngineId = str ReqId = str # @@ -873,9 +872,10 @@ class NixlConnectorWorker: self.copy_blocks: CopyBlocksOp | None = None # Map of engine_id -> kv_caches_base_addr. For TP case, each local - # rank will still only pull from a single remote TP worker. - self.kv_caches_base_addr: dict[EngineId, list[int]] = {} self.device_id: int = 0 + # Current rank may pull from multiple remote TP workers. 
+ # EngineId, dict[int, list[int]] -> engine_id, tp_rank, base_addr_for_layer + self.kv_caches_base_addr = defaultdict[EngineId, dict[int, list[int]]](dict) # Number of NIXL regions. Currently one region per cache # (so 1 per layer for MLA, otherwise 2 per layer) @@ -883,10 +883,12 @@ class NixlConnectorWorker: self.num_layers = 0 # nixl_prepped_dlist_handle. - self.src_xfer_side_handle: int = 0 - self.src_xfer_side_handles: dict[int, int] = {} - # Map of engine_id -> nixl_prepped_dlist_handle (int)]. - self.dst_xfer_side_handles: dict[EngineId, int] = {} + self.src_xfer_handles_by_block_size: dict[int, int] = {} + # Populated dynamically during handshake based on remote configuration. + # Keep track of regions at different tp_ratio values. tp_ratio->handles + self.src_xfer_handles_by_tp_ratio: dict[int, list[int]] = {} + # Map of engine_id -> {tp_rank: nixl_prepped_dlist_handle (int)}. + self.dst_xfer_side_handles = defaultdict[EngineId, dict[int, int]](dict) # Map of engine_id -> num_blocks. All ranks in the same deployment will # have the same number of blocks. @@ -977,103 +979,108 @@ class NixlConnectorWorker: expected_engine_id: str, ) -> dict[int, str]: """Do a NIXL handshake with a remote instance.""" - - start_time = time.perf_counter() - - # NOTE(rob): we need each rank to have a unique port. This is - # a hack to keep us moving. We will switch when moving to etcd - # or where we have a single ZMQ socket in the scheduler. - - # Handshake only with the remote TP rank that current local rank will - # pull from. With homogeneous TP it happens to be the same rank_i. - p_remote_rank = self.kv_topo.get_target_remote_rank(remote_tp_size) + # When target instance TP > local TP, we need to perform multiple + # handshakes. Do it in a single background job for simplicity. + # Regardless, only handshake with the remote TP rank(s) that current + # local rank will read from. Note that With homogeneous TP, + # this happens to be the same single rank_i. 
+ p_remote_ranks = self.kv_topo.get_target_remote_ranks(remote_tp_size) + remote_rank_to_agent_name = {} path = make_zmq_path("tcp", host, port) - logger.debug( - "Querying metadata on path: %s at remote tp rank %s", path, p_remote_rank - ) - # Send query for the request. with zmq_ctx(zmq.REQ, path) as sock: - msg = msgspec.msgpack.encode((GET_META_MSG, p_remote_rank)) - # Set receive timeout to 5 seconds to avoid hanging on dead server - sock.setsockopt(zmq.RCVTIMEO, 5000) # milliseconds - sock.send(msg) - handshake_bytes = sock.recv() - - # Decode handshake payload to get compatibility hash - handshake_decoder = msgspec.msgpack.Decoder(NixlHandshakePayload) - try: - handshake_payload = handshake_decoder.decode(handshake_bytes) - except (msgspec.DecodeError, msgspec.ValidationError) as e: - raise RuntimeError( - f"Failed to decode NixlHandshakePayload. This likely indicates " - f"an incompatibility between connector version. Error: {e}" - ) from e - - got_metadata_time = time.perf_counter() - logger.debug( - "NIXL handshake: get metadata took: %s", got_metadata_time - start_time - ) - - # Check compatibility hash BEFORE decoding agent metadata - if ( - self.enforce_compat_hash - and handshake_payload.compatibility_hash != self.compat_hash - ): - raise RuntimeError( - f"NIXL compatibility hash mismatch. " - f"Local: {self.compat_hash}, " - f"Remote: {handshake_payload.compatibility_hash}. " - f"Prefill and decode instances have incompatible configurations. " - f"This may be due to: different vLLM versions, models, dtypes, " - f"KV cache layouts, attention backends, etc. " - f"Both instances must use identical configurations." 
- f"Disable this check using " - f'--kv-transfer-config \'{{"kv_connector_extra_config": ' - f'{{"enforce_handshake_compat": false}}}}\'' + for remote_rank in p_remote_ranks: + logger.debug( + "Querying metadata on path: %s at remote tp rank %s", + path, + remote_rank, ) - logger.info( - "NIXL compatibility check passed (hash: %s)", - handshake_payload.compatibility_hash, - ) + start_time = time.perf_counter() + # Send query for the request. + msg = msgspec.msgpack.encode((GET_META_MSG, remote_rank)) + # Set receive timeout to 5 seconds to avoid hanging on dead server + sock.setsockopt(zmq.RCVTIMEO, 5000) # milliseconds + sock.send(msg) + handshake_bytes = sock.recv() - # Decode agent metadata - metadata_decoder = msgspec.msgpack.Decoder(NixlAgentMetadata) - try: - metadata = metadata_decoder.decode( - handshake_payload.agent_metadata_bytes - ) - except (msgspec.DecodeError, msgspec.ValidationError) as e: - # This should not happen if hash matched - raise RuntimeError( - f"Failed to decode NixlAgentMetadata. Error: {e}" - ) from e + # Decode handshake payload to get compatibility hash + handshake_decoder = msgspec.msgpack.Decoder(NixlHandshakePayload) + try: + handshake_payload = handshake_decoder.decode(handshake_bytes) + except (msgspec.DecodeError, msgspec.ValidationError) as e: + raise RuntimeError( + f"Failed to decode NixlHandshakePayload. This likely indicates " + f"an incompatibility between connector version. Error: {e}" + ) from e - # Ensure engine id matches. - if metadata.engine_id != expected_engine_id: - raise RuntimeError( - f"Remote NIXL agent engine ID mismatch. " - f"Expected {expected_engine_id}," - f"received {metadata.engine_id}." + got_metadata_time = time.perf_counter() + logger.debug( + "NIXL handshake: get metadata took: %s", + got_metadata_time - start_time, ) - # Register Remote agent. - assert metadata.block_size <= self.block_size, ( - "nP > nD is not supported yet." 
- ) - remote_agent_name = self.add_remote_agent( - metadata, p_remote_rank, remote_tp_size - ) + # Check compatibility hash BEFORE decoding agent metadata + if ( + self.enforce_compat_hash + and handshake_payload.compatibility_hash != self.compat_hash + ): + raise RuntimeError( + f"NIXL compatibility hash mismatch. " + f"Local: {self.compat_hash}, " + f"Remote: {handshake_payload.compatibility_hash}. " + f"Prefill and decode instances have incompatible " + f"configurations. This may be due to: different vLLM versions," + f" models, dtypes, KV cache layouts, attention backends, etc. " + f"Both instances must use identical configurations." + f"Disable this check using " + f'--kv-transfer-config \'{{"kv_connector_extra_config": ' + f'{{"enforce_handshake_compat": false}}}}\'' + ) - setup_agent_time = time.perf_counter() - logger.debug( - "NIXL handshake: add agent took: %s", - setup_agent_time - got_metadata_time, - ) + logger.info( + "NIXL compatibility check passed (hash: %s)", + handshake_payload.compatibility_hash, + ) - # Remote rank -> agent name. - return {p_remote_rank: remote_agent_name} + # Decode agent metadata + metadata_decoder = msgspec.msgpack.Decoder(NixlAgentMetadata) + try: + metadata = metadata_decoder.decode( + handshake_payload.agent_metadata_bytes + ) + except (msgspec.DecodeError, msgspec.ValidationError) as e: + # This should not happen if hash matched + raise RuntimeError( + f"Failed to decode NixlAgentMetadata. Error: {e}" + ) from e + + # Ensure engine id matches. + if metadata.engine_id != expected_engine_id: + raise RuntimeError( + f"Remote NIXL agent engine ID mismatch. " + f"Expected {expected_engine_id}," + f"received {metadata.engine_id}." + ) + # Ensure engine id matches. + if metadata.engine_id != expected_engine_id: + raise RuntimeError( + f"Remote NIXL agent engine ID mismatch. " + f"Expected {expected_engine_id}," + f"received {metadata.engine_id}." + ) + setup_agent_time = time.perf_counter() + + # Register Remote agent. 
+ remote_agent_name = self.add_remote_agent( + metadata, remote_rank, remote_tp_size + ) + logger.debug( + "NIXL handshake: add agent took: %s", + setup_agent_time - got_metadata_time, + ) + remote_rank_to_agent_name[remote_rank] = remote_agent_name + return remote_rank_to_agent_name def initialize_host_xfer_buffer(self, kv_caches: dict[str, torch.Tensor]) -> None: """ @@ -1283,7 +1290,7 @@ class NixlConnectorWorker: assert len(self.block_len_per_layer) == len(seen_base_addresses) assert self.num_blocks != 0 - self.kv_caches_base_addr[self.engine_id] = seen_base_addresses + self.kv_caches_base_addr[self.engine_id][self.tp_rank] = seen_base_addresses self.num_regions = len(caches_data) self.num_layers = len(xfer_buffers.keys()) @@ -1310,9 +1317,9 @@ class NixlConnectorWorker: # Register local/src descr for NIXL xfer. self.seen_base_addresses = seen_base_addresses - self.src_xfer_side_handle = self.register_local_xfer_handler(self.block_size) - - self.src_xfer_side_handles[self.block_size] = self.src_xfer_side_handle + self.src_xfer_handles_by_block_size[self.block_size], self.src_blocks_data = ( + self.register_local_xfer_handler(self.block_size) + ) # TODO(mgoin): Hybrid memory allocator is currently disabled for # models with local attention (Llama 4). Can remove this once enabled. @@ -1340,8 +1347,8 @@ class NixlConnectorWorker: agent_metadata = NixlAgentMetadata( engine_id=self.engine_id, agent_metadata=self.nixl_wrapper.get_agent_metadata(), - kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id], device_id=self.device_id, + kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id][self.tp_rank], num_blocks=self.num_blocks, block_lens=self.block_len_per_layer, kv_cache_layout=self.kv_cache_layout @@ -1359,7 +1366,7 @@ class NixlConnectorWorker: def register_local_xfer_handler( self, block_size: int, - ) -> int: + ) -> tuple[int, list[tuple[int, int, int]]]: """ Function used for register local xfer handler with local block_size or Remote block_size. 
@@ -1407,7 +1414,7 @@ class NixlConnectorWorker: descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type) # NIXL_INIT_AGENT to be used for preparations of local descs. - return self.nixl_wrapper.prep_xfer_dlist("NIXL_INIT_AGENT", descs) + return self.nixl_wrapper.prep_xfer_dlist("NIXL_INIT_AGENT", descs), blocks_data def add_remote_agent( self, @@ -1421,10 +1428,12 @@ class NixlConnectorWorker: In particular, handle both homogeneous and heterogeneous TP. The former requires local rank_i to read from remote rank_i. - The latter, assuming D.world_size > P.world_size, requires that two or - more local TP worker share the xfer from a single TP worker. + The latter, in the case of D.world_size < P.world_size, requires that a + local (D) TP worker reads from multiple remote (P) TP workers. + Conversely, assuming D.world_size > P.world_size, two or more local TP + workers will read from a single remote TP worker. - Here's an example (non-MLA case): + Here's an example for the last case described above (non-MLA): rank_offset p_remote_tp_rank (kv split no) @@ -1474,9 +1483,6 @@ class NixlConnectorWorker: nixl_agent_meta.agent_metadata ) - # Handle tp_size>num_kv_heads: replicate KV cache. - replicates_kv_cache = self.kv_topo.replicates_kv_cache(engine_id) - # Create dst descs and xfer side handles. TP workers have same #blocks # so we only register once per engine_id. # Example: @@ -1490,14 +1496,52 @@ class NixlConnectorWorker: self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks # Keep track of remote agent kv caches base addresses. - self.kv_caches_base_addr[engine_id] = nixl_agent_meta.kv_caches_base_addr - + self.kv_caches_base_addr[engine_id][remote_tp_rank] = ( + nixl_agent_meta.kv_caches_base_addr + ) self._validate_remote_agent_handshake(nixl_agent_meta, remote_tp_size) - # Number of D TP workers reading from a single P TP worker. This is - # 1 when P and D `--tensor-parallel-size` match. 
+ # This is 1 when P and D `--tensor-parallel-size` match. Otherwise, + # this is the ratio between the two sizes. tp_ratio = self.kv_topo.tp_ratio_from_engine_id(engine_id) + # Handle tp_size>num_kv_heads: replicate KV cache. + indexes_into_remote = ( + not self.kv_topo.replicates_kv_cache(engine_id) and tp_ratio > 0 + ) + + logger.debug( + "Registering remote agent (%s, rank %s) memory regions with tp_ratio %s", + engine_id, + remote_tp_rank, + tp_ratio, + ) + + ### (Optional) Register local agent memory regions. MLA is not split. + if ( + tp_ratio < 0 + and not self.use_mla + and tp_ratio not in self.src_xfer_handles_by_tp_ratio + ): + # Remote tp_size > local tp_size: read from multiple remote ranks. + # Logically "split" own regions into |tp_ratio| chunks. Mind that + # we only do this once per remote tp_size (replica-friendly). + self.src_xfer_handles_by_tp_ratio[tp_ratio] = [] + for i in range(-tp_ratio): + blocks_data = [] + for memory_region in self.src_blocks_data: + addr, local_block_len, own_tp_rank = memory_region + # Computing block len layer by layer allows for different + # block sizes to be used. + remote_block_len = local_block_len // (-tp_ratio) + addr = addr + i * remote_block_len + blocks_data.append((addr, remote_block_len, own_tp_rank)) + descs = self.nixl_wrapper.get_xfer_descs( + blocks_data, self.nixl_memory_type + ) + handle = self.nixl_wrapper.prep_xfer_dlist("NIXL_INIT_AGENT", descs) + self.src_xfer_handles_by_tp_ratio[tp_ratio].append(handle) + ### Register remote agent memory regions blocks_data = [] # With homogeneous TP, D pulls the whole kv cache from corresponding @@ -1507,14 +1551,19 @@ class NixlConnectorWorker: # Register all remote blocks, but only the corresponding kv heads. for i, base_addr in enumerate(nixl_agent_meta.kv_caches_base_addr): - kv_block_len = self.get_backend_aware_kv_block_len(layer_idx=i) - remote_kv_block_len = kv_block_len // block_size_ratio + # Read our whole local region size from remote. 
+ local_block_len = self.get_backend_aware_kv_block_len(layer_idx=i) + remote_kv_block_len = local_block_len // block_size_ratio if block_size_ratio > 1: # using remote kv_block_len as transfer unit - kv_block_len = remote_kv_block_len + local_block_len = remote_kv_block_len + + if tp_ratio < 0 and not self.use_mla: + # Remote tp is bigger: read a chunk of local region from remote + local_block_len = local_block_len // (-tp_ratio) rank_offset = ( self.tp_rank % tp_ratio * remote_kv_block_len - if not replicates_kv_cache + if indexes_into_remote else 0 ) for block_id in range(nixl_agent_meta.num_blocks): @@ -1524,7 +1573,7 @@ class NixlConnectorWorker: # self.block_len == remote_block_len//tp_ratio bytes. addr = base_addr + block_offset + rank_offset # (addr, len, device id) - blocks_data.append((addr, kv_block_len, nixl_agent_meta.device_id)) + blocks_data.append((addr, local_block_len, nixl_agent_meta.device_id)) if self.kv_topo.is_kv_layout_blocks_first: # With FlashInfer index V separately to allow head splitting. @@ -1533,7 +1582,7 @@ class NixlConnectorWorker: addr = base_addr + block_offset + rank_offset v_addr = addr + nixl_agent_meta.block_lens[i] // 2 blocks_data.append( - (v_addr, kv_block_len, nixl_agent_meta.device_id) + (v_addr, local_block_len, nixl_agent_meta.device_id) ) logger.debug( @@ -1546,15 +1595,15 @@ class NixlConnectorWorker: # Register with NIXL. 
descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type) - self.dst_xfer_side_handles[engine_id] = self.nixl_wrapper.prep_xfer_dlist( - remote_agent_name, descs + self.dst_xfer_side_handles[engine_id][remote_tp_rank] = ( + self.nixl_wrapper.prep_xfer_dlist(remote_agent_name, descs) ) if block_size_ratio > 1: # when prefill with smaller block_size, we need to init a # new handler with same block_len to match - self.src_xfer_side_handles[nixl_agent_meta.block_size] = ( - self.register_local_xfer_handler(nixl_agent_meta.block_size) + self.src_xfer_handles_by_block_size[nixl_agent_meta.block_size] = ( + self.register_local_xfer_handler(nixl_agent_meta.block_size)[0] ) return remote_agent_name @@ -1574,7 +1623,9 @@ class NixlConnectorWorker: block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id( remote_engine_id ) - assert tp_ratio > 0, "Decode TP cannot be smaller than prefill TP" + # Num kv_heads > tp_size and P TP > D TP case, not supported + assert not (tp_ratio < 0 and self.kv_topo.is_kv_replicated(remote_engine_id)) + assert not self._use_pallas or tp_ratio == 1, ( "TPU (pallas_v1) DOES NOT support heterogeneous TP yet." ) @@ -1616,17 +1667,29 @@ class NixlConnectorWorker: "All remote layers must have the same block size" ) - assert ( - remote_block_len - == (self.block_len_per_layer[0] * tp_ratio) // block_size_ratio - ), ( - "Remote P worker KV layer cache must be of shape [2, N, " - "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype." - ) + if tp_ratio > 0: + # Remote tp is smaller: remote block_len size is bigger + assert ( + remote_block_len + == (self.block_len_per_layer[0] * tp_ratio) // block_size_ratio + ), ( + "Remote P worker KV layer cache must be of shape [2, N, " + "local_kv_heads*tp_ratio, page_size, head_dim] and same dtype." + ) # noqa: E501 + else: + assert block_size_ratio == 1, ( + "Different local/remote block sizes are not supported when" + " P TP > D TP." 
+ ) + # Remote tp is bigger: remote block_len size is smaller + assert remote_block_len == self.block_len_per_layer[0] // (-tp_ratio), ( + "Remote P worker KV layer cache must be of shape [2, N, " + "local_kv_heads/tp_ratio, page_size, head_dim] and same dtype." + ) # noqa: E501 - # TP workers have same #blocks. + # TP workers that handhshake with same remote have same #blocks. assert self.dst_num_blocks[remote_engine_id] == nixl_agent_meta.num_blocks - + # Same number of regions/~layers. assert len(nixl_agent_meta.kv_caches_base_addr) == len(self.block_len_per_layer) def sync_recved_kv_to_device(self, req_id: str, meta: ReqMeta): @@ -1710,7 +1773,7 @@ class NixlConnectorWorker: ) cache.index_copy_(0, indices, permuted_blocks) - def blocksize_post_process(self, block_ids_per_ratio: dict[float, list[list[int]]]): + def blocksize_post_process(self, block_ids_per_ratio: dict[int, list[list[int]]]): def _process_local_gt_remote(blocks_to_update, block_size_ratio): n_kv_heads, block_size, head_size = blocks_to_update.shape[1:] remote_block_size = block_size // block_size_ratio @@ -1840,7 +1903,7 @@ class NixlConnectorWorker: notified_req_ids: set[str] = set() for notifs in self.nixl_wrapper.get_new_notifs().values(): for notif in notifs: - req_id, tp_ratio = notif.decode("utf-8").rsplit(":", 1) + req_id, tp_size = notif.decode("utf-8").rsplit(":", 1) if ( req_id not in self._reqs_to_send and req_id not in self._reqs_to_process @@ -1853,9 +1916,22 @@ class NixlConnectorWorker: ) continue + # NOTE: `tp_ratio` is the opposite when swapping local<>remote + n_consumers = int(tp_size) + tp_ratio = self.kv_topo.tp_ratio(n_consumers) + + # Number of reads *per producer* to wait for. + # When remote D TP > local P TP we expect `tp_ratio` reads. + consumers_per_producer = ( + -tp_ratio if n_consumers > self.world_size else 1 + ) + self.consumer_notification_counts_by_req[req_id] += 1 # Wait all consumers (D) to be done reading before freeing. 
- if self.consumer_notification_counts_by_req[req_id] == int(tp_ratio): + if ( + self.consumer_notification_counts_by_req[req_id] + == consumers_per_producer + ): notified_req_ids.add(req_id) del self.consumer_notification_counts_by_req[req_id] self._reqs_to_process.remove(req_id) @@ -1872,7 +1948,7 @@ class NixlConnectorWorker: """ done_req_ids: set[str] = set() for req_id, handles in list(transfers.items()): - in_progress = False + in_progress = [] for handle in handles: try: xfer_state = self.nixl_wrapper.check_xfer_state(handle) @@ -1882,7 +1958,7 @@ class NixlConnectorWorker: self.xfer_stats.record_transfer(res) self.nixl_wrapper.release_xfer_handle(handle) elif xfer_state == "PROC": - in_progress = True + in_progress.append(handle) continue else: logger.error( @@ -1892,7 +1968,6 @@ class NixlConnectorWorker: xfer_state, ) self._handle_failed_transfer(req_id, handle) - in_progress = False except Exception: logger.exception( "NIXL transfer exception for request %s. " @@ -1900,11 +1975,13 @@ class NixlConnectorWorker: req_id, ) self._handle_failed_transfer(req_id, handle) - in_progress = False if not in_progress: + # Only report request as completed when all transfers are done. 
done_req_ids.add(req_id) del transfers[req_id] + else: + transfers[req_id] = in_progress return done_req_ids def _handle_failed_transfer(self, req_id: str, handle: int): @@ -1982,18 +2059,62 @@ class NixlConnectorWorker: def _read_blocks_for_req(self, req_id: str, meta: ReqMeta): assert meta.remote is not None - logger.debug( - "Remote agent %s available, calling _read_blocks for req %s", - meta.remote.engine_id, - req_id, - ) - self._read_blocks( - request_id=req_id, - dst_engine_id=meta.remote.engine_id, - remote_request_id=meta.remote.request_id, - local_block_ids=meta.local_physical_block_ids, - remote_block_ids=meta.remote.block_ids, + remote_ranks = self.kv_topo.get_target_remote_ranks_from_engine_id( + meta.remote.engine_id ) + tp_ratio = self.kv_topo.tp_ratio_from_engine_id(meta.remote.engine_id) + # D may have to perform multiple reads from different remote ranks. + for i, remote_rank in enumerate(remote_ranks): + if self.use_mla and tp_ratio < 0 and i > 0: + # MLA opt: when P TP > D TP, only a single read is executed for + # the first remote rank (cache is duplicated).. + break + + remote_block_size = self.kv_topo.remote_block_size[meta.remote.engine_id] + logger.debug( + "Remote agent %s available, calling _read_blocks" + " on remote rank %s with remote block size %s for req %s", + meta.remote.engine_id, + remote_rank, + remote_block_size, + req_id, + ) + # Get side handles. + if tp_ratio < 0 and not self.use_mla: + assert remote_block_size == self.block_size + # Remote tp_size > local tp_size: we must perform multiple + # reads. Get the memory chunk onto which we will write to. + local_xfer_side_handle = self.src_xfer_handles_by_tp_ratio[tp_ratio][i] + else: + # Single read from remote, we write to the whole memory region. + # Also handle remote block size different from local block size. + local_xfer_side_handle = self.src_xfer_handles_by_block_size[ + remote_block_size + ] + + # Destination handle: remote_engine_id -> remote_rank -> handle. 
+ remote_xfer_side_handle = self.dst_xfer_side_handles[meta.remote.engine_id][ + remote_rank + ] + self._read_blocks( + request_id=req_id, + dst_engine_id=meta.remote.engine_id, + remote_request_id=meta.remote.request_id, + local_block_ids=meta.local_physical_block_ids, + remote_block_ids=meta.remote.block_ids, + remote_rank=remote_rank, + local_xfer_side_handle=local_xfer_side_handle, + remote_xfer_side_handle=remote_xfer_side_handle, + ) + + if self.use_mla and tp_ratio < 0: + # ..but we still need to notify the other remote ranks that we + # have the blocks we need so they can update the request state. + notif_id = f"{req_id}:{self.world_size}".encode() + remote_agents = self._remote_agents[meta.remote.engine_id] + for rank_to_notify, agent in remote_agents.items(): + if rank_to_notify != remote_rank: + self.nixl_wrapper.send_notif(agent, notif_msg=notif_id) def _read_blocks( self, @@ -2002,7 +2123,14 @@ class NixlConnectorWorker: dst_engine_id: str, request_id: str, remote_request_id: str, + remote_rank: int, + local_xfer_side_handle: int, + remote_xfer_side_handle: int, ): + """ + Post a READ point-to-point xfer request from a single local worker to + a single remote worker. + """ block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(dst_engine_id) if block_size_ratio > 1: local_block_ids = self.get_mapped_blocks( @@ -2031,18 +2159,14 @@ class NixlConnectorWorker: # saturate IB with heterogeneous TP sizes. We should remove the staging # blocks until we are ready. - # Number of D TP workers that will read from dst P. Propagate tp_ratio + # Number of D TP workers that will read from dst P. Propagate info # on notification so that dst worker can wait before freeing blocks. 
- tp_ratio = self.kv_topo.tp_ratio_from_engine_id(dst_engine_id) - notif_id = f"{remote_request_id}:{tp_ratio}".encode() + notif_id = f"{remote_request_id}:{self.world_size}".encode() # Full prefix cache hit: do not need to read remote blocks, # just notify P worker that we have the blocks we need. num_local_blocks = len(local_block_ids) if num_local_blocks == 0: - remote_rank = self.kv_topo.get_target_remote_rank_from_engine_id( - dst_engine_id - ) agent_name = self._remote_agents[dst_engine_id][remote_rank] try: self.nixl_wrapper.send_notif(agent_name, notif_msg=notif_id) @@ -2062,13 +2186,6 @@ class NixlConnectorWorker: if num_local_blocks < num_remote_blocks: remote_block_ids = remote_block_ids[-num_local_blocks:] - # Get side handles. - remote_block_size = self.kv_topo.remote_block_size[dst_engine_id] - local_xfer_side_handle = self.src_xfer_side_handles.get( - remote_block_size, self.src_xfer_side_handle - ) - remote_xfer_side_handle = self.dst_xfer_side_handles[dst_engine_id] - # NOTE (nicolo) With homogeneous TP, each TP worker loads KV from # corresponding rank. With heterogeneous TP, fixing D>P, the D tp # workers will issue xfers to parts of the P worker remote kv caches. @@ -2230,7 +2347,7 @@ class NixlConnectorWorker: block_ids_np, self._physical_blocks_per_logical_kv_block, block_arange ).tolist() - def get_backend_aware_kv_block_len(self, layer_idx: int): + def get_backend_aware_kv_block_len(self, layer_idx: int) -> int: """ Get the block length for one K/V element (K and V have the same size). 
@@ -2276,11 +2393,16 @@ class NixlConnectorWorker: for handle in handles: self.nixl_wrapper.release_xfer_handle(handle) self._recving_transfers.clear() - if self.src_xfer_side_handle: - self.nixl_wrapper.release_dlist_handle(self.src_xfer_side_handle) - self.src_xfer_side_handle = 0 - for dst_xfer_side_handle in self.dst_xfer_side_handles.values(): - self.nixl_wrapper.release_dlist_handle(dst_xfer_side_handle) + for handle in self.src_xfer_handles_by_block_size.values(): + self.nixl_wrapper.release_dlist_handle(handle) + self.src_xfer_handles_by_block_size.clear() + for handles in self.src_xfer_handles_by_tp_ratio.values(): + for handle in handles: + self.nixl_wrapper.release_dlist_handle(handle) + self.src_xfer_handles_by_tp_ratio.clear() + for dst_xfer_side_handles in self.dst_xfer_side_handles.values(): + for dst_xfer_side_handle in dst_xfer_side_handles.values(): + self.nixl_wrapper.release_dlist_handle(dst_xfer_side_handle) self.dst_xfer_side_handles.clear() for remote_agents in self._remote_agents.values(): for agent_name in remote_agents.values(): From 6fe58876528751a808c0d25016d3c97b432b909f Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 18 Dec 2025 11:54:39 +0800 Subject: [PATCH 040/176] [Chore] Remove v0 dead code for Qwen2.5-omni (#30883) Signed-off-by: Isotr0py --- .../models/qwen2_5_omni_thinker.py | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index f9bce4bf981b2..94deeb867c9f8 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -70,7 +70,6 @@ from vllm.multimodal.inputs import ( MultiModalFeatureSpec, MultiModalFieldConfig, MultiModalKwargsItems, - NestedTensors, ) from vllm.multimodal.parse import ( AudioProcessorItems, @@ -1150,27 +1149,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration( handle_oov_mm_token=handle_oov_mm_token, ) - def 
embed_multimodal_v0(self, **kwargs: object) -> NestedTensors | None: - audio_input = self._parse_and_validate_audio_input(**kwargs) - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - - if audio_input is None and image_input is None and video_input is None: - return None - - multimodal_embeddings: list[tuple[NestedTensors, str]] = [] - - if audio_input is not None: - audio_embeds = self._process_audio_input(audio_input) - multimodal_embeddings.append((audio_embeds, "audio")) - if image_input is not None: - image_embeds = self._process_image_input(image_input) - multimodal_embeddings.append((image_embeds, "image")) - if video_input is not None: - video_embeds = self._process_video_input(video_input) - multimodal_embeddings.append((video_embeds, "video")) - return multimodal_embeddings - def forward( self, input_ids: torch.Tensor, From 5a3adf581e372c60d6135a535561f4d491c4d046 Mon Sep 17 00:00:00 2001 From: gnovack Date: Wed, 17 Dec 2025 19:55:00 -0800 Subject: [PATCH 041/176] fused_moe_lora PDL improvements (#30716) Signed-off-by: gnovack Co-authored-by: Cyrus Leung Co-authored-by: Jee Jee Li --- vllm/lora/ops/triton_ops/fused_moe_lora_op.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py index 34383cdf1767c..f04936221eea6 100644 --- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py +++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py @@ -156,16 +156,22 @@ def _fused_moe_lora_kernel( + offs_bn[None, :] * stride_bn ) + if USE_GDC and IS_PRIMARY: + # GDC launch dependents hints the runtime system to launch dependent kernels. + tl.extra.cuda.gdc_launch_dependents() + # accumulator accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # GDC wait waits for ALL programs in the prior kernel to complete + # before continuing. 
+ if USE_GDC and not IS_PRIMARY: + tl.extra.cuda.gdc_wait() + for k in range(0, grid_k): k_remaining = K - k * (BLOCK_SIZE_K * SPLIT_K) # pre-fetch lora weight b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0) - # GDC wait waits for ALL programs in the prior kernel to complete - # before continuing. - if USE_GDC and not IS_PRIMARY: - tl.extra.cuda.gdc_wait() a = tl.load( a_ptrs, mask=token_mask[:, None] & (offs_k[None, :] < k_remaining), @@ -179,9 +185,6 @@ def _fused_moe_lora_kernel( if MUL_ROUTED_WEIGHT: moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0) accumulator = accumulator * moe_weight[:, None] - if USE_GDC and IS_PRIMARY: - # GDC launch dependents hints the runtime system to launch dependent kernels. - tl.extra.cuda.gdc_launch_dependents() accumulator = accumulator.to(c_ptr.dtype.element_ty) # Write back the block of the output offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) @@ -290,6 +293,7 @@ def _fused_moe_lora_shrink( def _fused_moe_lora_expand( output: torch.Tensor, # (num_tokens, top_k_num, N*len(lora_a_stacked),) a_intermediate_cache1: torch.Tensor, # (num_slices, M, top_k_num, max_lora_rank) + b_intermediate_cache1: torch.Tensor, # (num_slices, M, top_k_num, output_dim_size) lora_b_stacked: list[ torch.Tensor ], # [(max_loras, num_experts, max_lora_rank, K,),...] 
@@ -331,11 +335,6 @@ def _fused_moe_lora_expand( -1, a_intermediate_cache1.shape[3] ) - b_intermediate_cache1 = torch.zeros( - (num_slices, M, top_k_num, w1_output_dim_size), - dtype=output.dtype, - device=device, - ) use_gdc = supports_pdl(a_intermediate_cache1.device) expand_config = { "BLOCK_SIZE_M": block_size_m, @@ -460,6 +459,12 @@ def _fused_moe_lora( device=device, ) + b_intermediate_cache1 = torch.zeros( + (num_slices, M, top_k_num, w1_output_dim_size), + dtype=output.dtype, + device=device, + ) + _fused_moe_lora_shrink( a_intermediate_cache1, qcurr_hidden_states, @@ -506,6 +511,7 @@ def _fused_moe_lora( _fused_moe_lora_expand( output, a_intermediate_cache1, + b_intermediate_cache1, lora_b_stacked, topk_weights, sorted_token_ids, From 0c738b58bc0e5a5bf2448c95fc2014b83127a4d5 Mon Sep 17 00:00:00 2001 From: Bowen Bao Date: Wed, 17 Dec 2025 20:20:42 -0800 Subject: [PATCH 042/176] [Quantization] Support Quark int4-fp8 w4a8 for MoE (#30071) Signed-off-by: Bowen Bao --- .../layers/quantization/quark/quark.py | 43 +++++ .../layers/quantization/quark/quark_moe.py | 160 +++++++++++++++++- 2 files changed, 201 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 3640e5c452786..39bcd56bcd3dc 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -218,6 +218,49 @@ class QuarkConfig(QuantizationConfig): else: return False + def _is_fp8_w4a8( + self, + weight_quant: list[dict[str, Any]] | None, + input_quant: dict[str, Any] | None, + ) -> bool: + # Confirm weights and input quantized. 
+ if weight_quant is None or input_quant is None: + return False + + if not isinstance(weight_quant, list) or len(weight_quant) != 2: + return False + + # Confirm weight scheme is supported + is_w4a8_dtype = ( + weight_quant[0].get("dtype") == "fp8_e4m3" + and weight_quant[1].get("dtype") == "int4" + and input_quant.get("dtype") == "fp8_e4m3" + ) + is_static_weight = not weight_quant[0].get("is_dynamic") and not weight_quant[ + 1 + ].get("is_dynamic") + is_per_tensor_fp8_and_per_channel_int4_weight = ( + weight_quant[0].get("qscheme") == "per_tensor" + and weight_quant[1].get("qscheme") == "per_channel" + and weight_quant[1].get("symmetric") is True + and weight_quant[1].get("ch_axis") == 0 + ) + + if not ( + is_w4a8_dtype + and is_static_weight + and is_per_tensor_fp8_and_per_channel_int4_weight + ): + return False + + # Dynamic quantization is always supported if weights supported. + if input_quant.get("is_dynamic"): + return True + + # Confirm activation scheme is supported. + is_per_tensor_activation = input_quant.get("qscheme") == "per_tensor" + return is_per_tensor_activation + def _is_fp8_w8a8( self, weight_quant: dict[str, Any] | None, diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index d84e22d1fa0f2..0b9b098afb1f6 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -63,8 +63,9 @@ class QuarkMoEMethod(FusedMoEMethodBase): ) weight_config = layer_quant_config.get("weight") input_config = layer_quant_config.get("input_tensors") - - if quant_config._is_fp8_w8a8(weight_config, input_config): + if quant_config._is_fp8_w4a8(weight_config, input_config): + return QuarkW4A8Fp8MoEMethod(weight_config, input_config, module.moe_config) + elif quant_config._is_fp8_w8a8(weight_config, input_config): return QuarkW8A8Fp8MoEMethod(weight_config, input_config, module.moe_config) elif 
quant_config._is_ocp_mx(weight_config, input_config): return QuarkOCP_MX_MoEMethod(weight_config, input_config, module.moe_config) @@ -396,6 +397,161 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): ) +class QuarkW4A8Fp8MoEMethod(QuarkMoEMethod): + def __init__( + self, + weight_config: dict[str, Any], + input_config: dict[str, Any], + moe: FusedMoEConfig, + ): + super().__init__(moe) + self.weight_quant = weight_config + self.input_quant = input_config + + assert rocm_aiter_ops.is_fused_moe_enabled(), ( + "W4A8 FP8 MoE requires ROCm AITER fused MoE support." + ) + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + params_dtype = torch.uint32 + w13_weight = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // 8, # INT32 packing for W4 + dtype=params_dtype, + ), + requires_grad=False, + ) + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition // 8, # INT32 packing for W4 + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_weight) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # Per-tensor fp8 weight scales + w13_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False + ) + w2_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} + ) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + 
# Per-channel int4 weight scales + w13_weight_scale_2 = torch.nn.Parameter( + torch.ones( + num_experts, + 2 * intermediate_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + w2_weight_scale_2 = torch.nn.Parameter( + torch.ones(num_experts, hidden_size, dtype=torch.float32), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale_2", w13_weight_scale_2) + layer.register_parameter("w2_weight_scale_2", w2_weight_scale_2) + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} + ) + set_weight_attrs(w13_weight_scale_2, extra_weight_attrs) + set_weight_attrs(w2_weight_scale_2, extra_weight_attrs) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data + ) + layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) + + # INT4-FP8 : offset INT4 w13_weight_scale1 to single w13_weight_scale + # Fp8 moe kernel needs single fp8 w13_weight_scale for w13 per expert. + # We won't do requant each expert's fp8 weight (not direct available), + # instead we adjust half of INT4 w13_weight_scale1 numbers + shard_size = layer.intermediate_size_per_partition + max_w13_scales = layer.w13_weight_scale.max(dim=1).values + assert torch.all(max_w13_scales != 0), "fp8 weight scale cannot be zero." 
+ for expert_id in range(layer.local_num_experts): + start = 0 + max_w13_scale_fp8 = max_w13_scales[expert_id] + for shard_id in range(2): + if layer.w13_weight_scale[expert_id][shard_id] != max_w13_scale_fp8: + int4_rescale = ( + layer.w13_weight_scale[expert_id][shard_id] / max_w13_scale_fp8 + ) + layer.w13_weight_scale_2[expert_id][start : start + shard_size] *= ( + int4_rescale + ) + start += shard_size + + layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, requires_grad=False) + + # special hack to asm_moe, which takes (weight_scale1 * weight_scale) as post + # GEMM scaling optimal design - shall apply per-column weight_scale1 before + # GEMM, and weight_scale post + for expert_id in range(layer.local_num_experts): + layer.w13_weight_scale_2[expert_id] *= max_w13_scales[expert_id] + layer.w2_weight_scale_2[expert_id] *= layer.w2_weight_scale[expert_id] + + def get_fused_moe_quant_config(self, layer): + return fp8_w8a8_moe_quant_config( + w1_scale=layer.w13_weight_scale_2, + w2_scale=layer.w2_weight_scale_2, + per_out_ch_quant=True, + ) + + def apply( + self, + layer: FusedMoE, + x: torch.Tensor, + router_logits: torch.Tensor, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + topk_weights, topk_ids, _ = layer.select_experts( + hidden_states=x, + router_logits=router_logits, + ) + + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + rocm_aiter_fused_experts, + ) + + return rocm_aiter_fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + quant_config=self.moe_quant_config, + expert_map=layer.expert_map, + ) + + class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): def __init__( self, From 4a8412f773c67e0ba1eb1d4992095d6e0204f0ce Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 17 Dec 2025 23:21:51 -0500 Subject: [PATCH 043/176] [UX] Reduce DeepGEMM 
warmup log output to single progress bar (#30903) Signed-off-by: Matthew Bonanni --- .../model_executor/warmup/deep_gemm_warmup.py | 141 ++++++++++++------ 1 file changed, 99 insertions(+), 42 deletions(-) diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index 936f6b1e28ce1..2bbc655bd935f 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -10,7 +10,7 @@ import torch from tqdm import tqdm import vllm.envs as envs -from vllm.distributed.parallel_state import get_dp_group +from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEModularMethod @@ -175,7 +175,30 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool: FP8_GEMM_NT_WARMUP_CACHE: set[torch.Size] = set() -def _deepgemm_fp8_gemm_nt_warmup(w: torch.Tensor, ws: torch.Tensor, max_tokens: int): +def _get_fp8_gemm_nt_m_values(w: torch.Tensor, max_tokens: int) -> list[int]: + """Get the M values to warmup for a given weight tensor.""" + n, _ = w.size() + device = w.device + + # Use optimal M values only if VLLM_DEEP_GEMM_WARMUP is set to "relax". 
+ # Otherwise warmup all token sizes to avoid JIT compilation in hotpath + if envs.VLLM_DEEP_GEMM_WARMUP == "relax": + return _generate_optimal_warmup_m_values(max_tokens, n, device) + else: + assert envs.VLLM_DEEP_GEMM_WARMUP == "full", ( + "Expected " + 'VLLM_DEEP_GEMM_WARMUP env to be set to "full" but got ' + f"{envs.VLLM_DEEP_GEMM_WARMUP}" + ) + return list(range(1, max_tokens + 1)) + + +def _deepgemm_fp8_gemm_nt_warmup( + w: torch.Tensor, + ws: torch.Tensor, + max_tokens: int, + pbar: tqdm | None = None, +): if w.size() in FP8_GEMM_NT_WARMUP_CACHE: return @@ -189,27 +212,14 @@ def _deepgemm_fp8_gemm_nt_warmup(w: torch.Tensor, ws: torch.Tensor, max_tokens: ) out = torch.empty((max_tokens, n), device=device, dtype=torch.bfloat16) - # Use optimal M values only if VLLM_DEEP_GEMM_WARMUP is set to "relax". - # Otherwise warmup all token sizes to avoid JIT compilation in hotpath - if envs.VLLM_DEEP_GEMM_WARMUP == "relax": - m_values = _generate_optimal_warmup_m_values(max_tokens, n, device) - desc = f"DeepGemm(fp8_gemm_nt) warmup (W={w.size()}) [relaxed]" - else: - assert envs.VLLM_DEEP_GEMM_WARMUP == "full", ( - "Expected " - 'VLLM_DEEP_GEMM_WARMUP env to be set to "full" but got ' - f"{envs.VLLM_DEEP_GEMM_WARMUP}" - ) - m_values = list(range(1, max_tokens + 1)) - desc = f"DeepGemm(fp8_gemm_nt) warmup (W={w.size()}) [all tokens]" - - pbar = tqdm(total=len(m_values), desc=desc) + m_values = _get_fp8_gemm_nt_m_values(w, max_tokens) for num_tokens in m_values: fp8_gemm_nt( (a1q[:num_tokens], a1q_scales[:num_tokens]), (w, ws), out[:num_tokens] ) - pbar.update(1) + if pbar is not None: + pbar.update(1) FP8_GEMM_NT_WARMUP_CACHE.add(w.size()) @@ -217,20 +227,12 @@ def _deepgemm_fp8_gemm_nt_warmup(w: torch.Tensor, ws: torch.Tensor, max_tokens: GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE: set[torch.Size] = set() -def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup( +def _get_grouped_gemm_params( w1: torch.Tensor, w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: 
torch.Tensor, num_topk: int, max_tokens: int, -): - if ( - w1.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE - and w2.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE - ): - return - +) -> tuple[int, int, torch.Tensor]: assert w1.size(0) == w2.size(0), "w1 and w2 must have the same number of experts" block_m = get_mk_alignment_for_contiguous_layout()[0] @@ -253,6 +255,27 @@ def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup( ) expert_ids = torch.repeat_interleave(expert_ids_block, block_m, dim=0) + return MAX_M, block_m, expert_ids + + +def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup( + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + num_topk: int, + max_tokens: int, + pbar: tqdm | None = None, +): + if ( + w1.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE + and w2.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE + ): + return + + MAX_M, block_m, expert_ids = _get_grouped_gemm_params(w1, w2, num_topk, max_tokens) + device = w1.device + def _warmup(w: torch.Tensor, w_scale: torch.Tensor): _, n, k = w.size() a1q = torch.empty((MAX_M, k), device=device, dtype=torch.float8_e4m3fn) @@ -261,15 +284,8 @@ def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup( ) out = torch.empty((MAX_M, n), device=device, dtype=torch.bfloat16) - # Generate M values in block_m increments (already optimized for MoE) m_values = list(range(block_m, MAX_M + 1, block_m)) - pbar = tqdm( - total=len(m_values), - desc=f"DeepGemm(m_grouped_fp8_gemm_nt_contiguous) warmup (W={w.size()}) " - f"[{len(m_values)} values, block_m={block_m}]", - ) - for num_tokens in m_values: m_grouped_fp8_gemm_nt_contiguous( (a1q[:num_tokens], a1q_scales[:num_tokens]), @@ -277,7 +293,8 @@ def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup( out[:num_tokens], expert_ids[:num_tokens], ) - pbar.update(1) + if pbar is not None: + pbar.update(1) for w, ws in [(w1, w1_scale), (w2, w2_scale)]: if w.size() not in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE: @@ 
-285,16 +302,18 @@ def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup( GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE.add(w.size()) -def deepgemm_fp8_gemm_nt_warmup(model: torch.nn.Module, max_tokens: int): +def deepgemm_fp8_gemm_nt_warmup( + model: torch.nn.Module, max_tokens: int, pbar: tqdm | None = None +): dg_modules = [m for m in model.modules() if _fp8_linear_may_use_deep_gemm(m)] for dgm in dg_modules: w, ws, _ = _extract_data_from_linear_base_module(dgm) - _deepgemm_fp8_gemm_nt_warmup(w=w, ws=ws, max_tokens=max_tokens) + _deepgemm_fp8_gemm_nt_warmup(w=w, ws=ws, max_tokens=max_tokens, pbar=pbar) def deepgemm_grouped_fp8_gemm_nt_contiguous_warmup( - model: torch.nn.Module, max_tokens: int + model: torch.nn.Module, max_tokens: int, pbar: tqdm | None = None ): dg_modules = [ m for m in model.modules() if _fused_moe_grouped_gemm_may_use_deep_gemm(m) @@ -305,10 +324,48 @@ def deepgemm_grouped_fp8_gemm_nt_contiguous_warmup( dgm ) _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup( - w13, w2, w13_scale, w2_scale, num_topk, max_tokens + w13, w2, w13_scale, w2_scale, num_topk, max_tokens, pbar=pbar ) +def _count_warmup_iterations(model: torch.nn.Module, max_tokens: int) -> int: + seen_fp8_sizes: set[torch.Size] = set(FP8_GEMM_NT_WARMUP_CACHE) + seen_grouped_sizes: set[torch.Size] = set( + GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE + ) + + total = 0 + for m in model.modules(): + if _fp8_linear_may_use_deep_gemm(m): + w, _, _ = _extract_data_from_linear_base_module(m) + if w.size() not in seen_fp8_sizes: + total += len(_get_fp8_gemm_nt_m_values(w, max_tokens)) + seen_fp8_sizes.add(w.size()) + elif _fused_moe_grouped_gemm_may_use_deep_gemm(m): + w13, _, w2, _, num_topk = _extract_data_from_fused_moe_module(m) + if w13.size() in seen_grouped_sizes and w2.size() in seen_grouped_sizes: + continue + MAX_M, block_m, _ = _get_grouped_gemm_params(w13, w2, num_topk, max_tokens) + n_values = (MAX_M - block_m) // block_m + 1 + if w13.size() not in seen_grouped_sizes: + total += 
n_values + seen_grouped_sizes.add(w13.size()) + if w2.size() not in seen_grouped_sizes: + total += n_values + seen_grouped_sizes.add(w2.size()) + return total + + def deep_gemm_warmup(model: torch.nn.Module, max_tokens: int): - deepgemm_fp8_gemm_nt_warmup(model, max_tokens) - deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model, max_tokens) + total = _count_warmup_iterations(model, max_tokens) + if total == 0: + return + + # Only show progress bar on rank 0 to avoid cluttered output + if is_global_first_rank(): + with tqdm(total=total, desc="DeepGEMM warmup") as pbar: + deepgemm_fp8_gemm_nt_warmup(model, max_tokens, pbar) + deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model, max_tokens, pbar) + else: + deepgemm_fp8_gemm_nt_warmup(model, max_tokens, None) + deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model, max_tokens, None) From 5f2f3fba1d9ed0aa433171b86c415a5f02055035 Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Wed, 17 Dec 2025 23:22:23 -0500 Subject: [PATCH 044/176] [compile] Fix CI for test_gpt2_cache_hit (#30902) Signed-off-by: zhxchen17 --- tests/compile/test_aot_compile.py | 11 ++++++++++- vllm/config/compilation.py | 10 +++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py index 8fa305d6d72f5..2ffcd627e476a 100644 --- a/tests/compile/test_aot_compile.py +++ b/tests/compile/test_aot_compile.py @@ -9,6 +9,7 @@ from contextlib import contextmanager import pytest import torch +import vllm.model_executor.layers.activation from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, @@ -16,9 +17,12 @@ from vllm.config import ( VllmConfig, set_current_vllm_config, ) +from vllm.envs import disable_envs_cache from vllm.forward_context import set_forward_context from vllm.utils.torch_utils import is_torch_equal_or_newer +from ..utils import create_new_process_for_each_test + def reference_fn(x: torch.Tensor): assert 
x.shape[0] <= 42 @@ -66,6 +70,7 @@ def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch): torch.compiler.set_stance("fail_on_recompile"), ): CompiledMod(vllm_config=vllm_config)(*args) + disable_envs_cache() m.setenv("VLLM_USE_AOT_COMPILE", "1") torch._dynamo.reset() @@ -101,6 +106,7 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch): vllm_config = make_vllm_config() with use_vllm_config(vllm_config): expected = CompiledMod(vllm_config=vllm_config)(*args) + disable_envs_cache() m.setenv("VLLM_FORCE_AOT_LOAD", "1") vllm_config = make_vllm_config() @@ -130,6 +136,7 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch): artifacts = compiled_mod.aot_compiled_fn._artifacts guards_string = artifacts.compiled_fn.shape_env.format_guards() assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)" + disable_envs_cache() m.setenv("VLLM_FORCE_AOT_LOAD", "1") vllm_config = make_vllm_config() @@ -144,7 +151,7 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch): @pytest.mark.skipif( not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10" ) -@use_vllm_config(make_vllm_config()) +@create_new_process_for_each_test("spawn") def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch): """ Test that compiling gpt2 twice results in a cache hit and @@ -186,6 +193,8 @@ def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch): # Clean up first model del llm_model + disable_envs_cache() + vllm.model_executor.layers.activation._ACTIVATION_REGISTRY._dict.clear() # Second compilation - should hit cache m.setenv("VLLM_FORCE_AOT_LOAD", "1") diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 4a98494b3c7b3..3e3ee1e572ec8 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -437,14 +437,14 @@ class CompilationConfig: compile_ranges_split_points: list[int] | None = None """Split points that represent compile ranges for inductor. 
- The compile ranges are - [1, split_points[0]], - [split_points[0] + 1, split_points[1]], ..., + The compile ranges are + [1, split_points[0]], + [split_points[0] + 1, split_points[1]], ..., [split_points[-1] + 1, max_num_batched_tokens]. Compile sizes are also used single element ranges, the range is represented as [compile_sizes[i], compile_sizes[i]]. - - If a range overlaps with the compile size, graph for compile size + + If a range overlaps with the compile size, graph for compile size will be prioritized, i.e. if we have a range [1, 8] and a compile size 4, graph for compile size 4 will be compiled and used instead of the graph for range [1, 8]. From b166ef20e1e5256913b34456507d89850a8dba38 Mon Sep 17 00:00:00 2001 From: zzhxxx Date: Thu, 18 Dec 2025 12:45:56 +0800 Subject: [PATCH 045/176] [refactor] Add prefix support to embed_tokens in DeepSeek MTP (#30788) Signed-off-by: zzhx1 --- vllm/model_executor/models/deepseek_mtp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index ca77b8322e2e8..c25e8422da157 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -141,6 +141,7 @@ class DeepSeekMultiTokenPredictor(nn.Module): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, + prefix=maybe_prefix(prefix, "embed_tokens"), ) self.logits_processor = LogitsProcessor(config.vocab_size) From cfb7e55515a5558be3a7199044411953017352d3 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 18 Dec 2025 12:59:09 +0800 Subject: [PATCH 046/176] [Doc][CPU] Update CPU doc (#30765) Signed-off-by: jiang1.li Signed-off-by: Li, Jiang Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docker/Dockerfile.cpu | 4 +- .../installation/cpu.arm.inc.md | 34 ++++++++- docs/getting_started/installation/cpu.md | 11 +-- .../installation/cpu.x86.inc.md | 71 
+++++++++++++++++-- .../installation/python_env_setup.inc.md | 2 +- 5 files changed, 106 insertions(+), 16 deletions(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 8d55ecfba3e52..bd5bc43916eac 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -17,7 +17,7 @@ # VLLM_CPU_DISABLE_AVX512=false (default)|true # VLLM_CPU_AVX512BF16=false (default)|true # VLLM_CPU_AVX512VNNI=false (default)|true -# VLLM_CPU_AMXBF16=false (default)|true +# VLLM_CPU_AMXBF16=false |true (default) # ######################### COMMON BASE IMAGE ######################### @@ -95,7 +95,7 @@ ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16} ARG VLLM_CPU_AVX512VNNI=0 ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI} # Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ... -ARG VLLM_CPU_AMXBF16=0 +ARG VLLM_CPU_AMXBF16=1 ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16} WORKDIR /workspace/vllm diff --git a/docs/getting_started/installation/cpu.arm.inc.md b/docs/getting_started/installation/cpu.arm.inc.md index 657bf2509db01..4940e5781b29a 100644 --- a/docs/getting_started/installation/cpu.arm.inc.md +++ b/docs/getting_started/installation/cpu.arm.inc.md @@ -19,7 +19,7 @@ Pre-built vLLM wheels for Arm are available since version 0.11.2. These wheels c ```bash export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') -uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu +uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --index-strategy first-index ``` ??? console "pip" @@ -27,6 +27,20 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu pip install vllm==${VLLM_VERSION}+cpu --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu ``` +!!! 
warning "set `LD_PRELOAD`" + Before use vLLM CPU installed via wheels, make sure TCMalloc is installed and added to `LD_PRELOAD`: + ```bash + # install TCMalloc + sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4 + + # manually find the path + sudo find / -iname *libtcmalloc_minimal.so.4 + TC_PATH=... + + # add them to LD_PRELOAD + export LD_PRELOAD="$TC_PATH:$LD_PRELOAD" + ``` + The `uv` approach works for vLLM `v0.6.6` and later. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. **Install the latest code** @@ -37,7 +51,7 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe To install from nightly index, run: ```bash -uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu +uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu --index-strategy first-index ``` ??? console "pip (there's a caveat)" @@ -56,7 +70,7 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi ```bash export VLLM_COMMIT=730bd35378bf2a5b56b6d3a45be28b3092d26519 # use full commit hash from the main branch -uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}/cpu +uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}/cpu --index-strategy first-index ``` # --8<-- [end:pre-built-wheels] @@ -105,6 +119,20 @@ VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation Testing has been conducted on AWS Graviton3 instances for compatibility. 
+!!! warning "set `LD_PRELOAD`" + Before use vLLM CPU installed via wheels, make sure TCMalloc is installed and added to `LD_PRELOAD`: + ```bash + # install TCMalloc + sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4 + + # manually find the path + sudo find / -iname *libtcmalloc_minimal.so.4 + TC_PATH=... + + # add them to LD_PRELOAD + export LD_PRELOAD="$TC_PATH:$LD_PRELOAD" + ``` + # --8<-- [end:build-wheel-from-source] # --8<-- [start:pre-built-images] diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index 210f720e2d92a..affb94593dd42 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -18,6 +18,12 @@ vLLM is a Python library that supports the following CPU variants. Select your C --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:installation" +## Technical Discussions + +The main discussions happen in the `#sig-cpu` channel of [vLLM Slack](https://slack.vllm.ai/). + +When open a Github issue about the CPU backend, please add `[CPU Backend]` in the title and it will be labeled with `cpu` for better awareness. + ## Requirements - Python: 3.10 -- 3.13 @@ -258,11 +264,6 @@ vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel - GPTQ (x86 only) - compressed-tensor INT8 W8A8 (x86, s390x) -### (x86 only) What is the purpose of `VLLM_CPU_SGL_KERNEL`? - -- Both of them require `amx` CPU flag. - - `VLLM_CPU_SGL_KERNEL` can provide better performance for MoE models and small-batch scenarios. - ### Why do I see `get_mempolicy: Operation not permitted` when running in Docker? In some container environments (like Docker), NUMA-related syscalls used by vLLM (e.g., `get_mempolicy`, `migrate_pages`) are blocked/denied in the runtime's default seccomp/capabilities settings. This may lead to warnings like `get_mempolicy: Operation not permitted`. 
Functionality is not affected, but NUMA memory binding/migration optimizations may not take effect and performance can be suboptimal. diff --git a/docs/getting_started/installation/cpu.x86.inc.md b/docs/getting_started/installation/cpu.x86.inc.md index 1fad7f4338822..01e34eee10539 100644 --- a/docs/getting_started/installation/cpu.x86.inc.md +++ b/docs/getting_started/installation/cpu.x86.inc.md @@ -17,7 +17,51 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data # --8<-- [end:set-up-using-python] # --8<-- [start:pre-built-wheels] -Currently, there are no pre-built x86 CPU wheels. +Pre-built vLLM wheels for x86 with AVX512 are available since version 0.13.0. To install release wheels: + +```bash +export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') + +# use uv +uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --index-strategy first-index --torch-backend cpu +``` +??? console "pip" + ```bash + # use pip + pip install vllm==${VLLM_VERSION}+cpu --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --extra-index-url https://download.pytorch.org/whl/cpu + ``` +!!! warning "set `LD_PRELOAD`" + Before use vLLM CPU installed via wheels, make sure TCMalloc and Intel OpenMP are installed and added to `LD_PRELOAD`: + ```bash + # install TCMalloc, Intel OpenMP is installed with vLLM CPU + sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4 + + # manually find the path + sudo find / -iname *libtcmalloc_minimal.so.4 + sudo find / -iname *libiomp5.so + TC_PATH=... + IOMP_PATH=... 
+ + # add them to LD_PRELOAD + export LD_PRELOAD="$TC_PATH:$IOMP_PATH:$LD_PRELOAD" + ``` + +**Install the latest code** + +To install the wheel built from the latest main branch: + +```bash +uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu --index-strategy first-index --torch-backend cpu +``` + +**Install specific revisions** + +If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: + +```bash +export VLLM_COMMIT=730bd35378bf2a5b56b6d3a45be28b3092d26519 # use full commit hash from the main branch +uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}/cpu --index-strategy first-index --torch-backend cpu +``` # --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] @@ -26,10 +70,12 @@ Install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the def ```bash sudo apt-get update -y -sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev +sudo apt-get install -y gcc-12 g++-12 libnuma-dev sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 ``` +--8<-- "docs/getting_started/installation/python_env_setup.inc.md" + Clone the vLLM project: ```bash @@ -82,6 +128,22 @@ uv pip install dist/*.whl pip install dist/*.whl ``` +!!! warning "set `LD_PRELOAD`" + Before use vLLM CPU installed via wheels, make sure TCMalloc and Intel OpenMP are installed and added to `LD_PRELOAD`: + ```bash + # install TCMalloc, Intel OpenMP is installed with vLLM CPU + sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4 + + # manually find the path + sudo find / -iname *libtcmalloc_minimal.so.4 + sudo find / -iname *libiomp5.so + TC_PATH=... + IOMP_PATH=... + + # add them to LD_PRELOAD + export LD_PRELOAD="$TC_PATH:$IOMP_PATH:$LD_PRELOAD" + ``` + !!! example "Troubleshooting" - **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`. 
- **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed. @@ -95,7 +157,6 @@ uv pip install dist/*.whl "torch==X.Y.Z+cpu" # <------- ] ``` - - If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM. # --8<-- [end:build-wheel-from-source] # --8<-- [start:pre-built-images] @@ -112,6 +173,7 @@ uv pip install dist/*.whl docker build -f docker/Dockerfile.cpu \ --build-arg VLLM_CPU_AVX512BF16=false (default)|true \ --build-arg VLLM_CPU_AVX512VNNI=false (default)|true \ + --build-arg VLLM_CPU_AMXBF16=false|true (default) \ --build-arg VLLM_CPU_DISABLE_AVX512=false (default)|true \ --tag vllm-cpu-env \ --target vllm-openai . @@ -123,9 +185,8 @@ docker run --rm \ --shm-size=4g \ -p 8000:8000 \ -e VLLM_CPU_KVCACHE_SPACE= \ - -e VLLM_CPU_OMP_THREADS_BIND= \ vllm-cpu-env \ - --model=meta-llama/Llama-3.2-1B-Instruct \ + meta-llama/Llama-3.2-1B-Instruct \ --dtype=bfloat16 \ other vLLM OpenAI server arguments ``` diff --git a/docs/getting_started/installation/python_env_setup.inc.md b/docs/getting_started/installation/python_env_setup.inc.md index ba78c329723ed..06794f8d3120e 100644 --- a/docs/getting_started/installation/python_env_setup.inc.md +++ b/docs/getting_started/installation/python_env_setup.inc.md @@ -1,4 +1,4 @@ -On NVIDIA CUDA only, it's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands: +It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. 
Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands: ```bash uv venv --python 3.12 --seed From 717ac33d9cfa82357ea57dc0f6ee2aa325eba9f2 Mon Sep 17 00:00:00 2001 From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Date: Thu, 18 Dec 2025 09:16:04 +0400 Subject: [PATCH 047/176] [PERF] Qwen3-next. Add fp8 cutlass MoE tuned configs. `chmod -x *MI308X.json` (#29553) Signed-off-by: Vadim Gimpelson --- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 147 ++++++++++++++++++ ...N=768,device_name=AMD_Instinct_MI308X.json | 0 ...,dtype=fp8_w8a8,block_shape=[128,128].json | 147 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 147 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 147 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 147 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 147 ++++++++++++++++++ 7 files changed, 882 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json mode change 100755 => 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
vllm/model_executor/layers/fused_moe/configs/E=64,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..3859583fb31f2 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json old mode 100755 new mode 100644 diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..b03a587294217 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..7e57e97eef8a7 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..4438d15c56949 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 
+1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + 
"num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..93f7227b11269 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..694dbf47b2074 --- /dev/null +++ 
b/vllm/model_executor/layers/fused_moe/configs/E=64,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, 
+ "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} From 82dc338ad609c9a91b0ae764c7961083784cd620 Mon Sep 17 00:00:00 2001 From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Date: Wed, 17 Dec 2025 23:18:26 -0600 Subject: [PATCH 048/176] [AMD][CI] fix lm eval ci arg (#30911) Signed-off-by: Divakar Verma --- .buildkite/test-amd.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 6df373632d730..f294261ec8c3a 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -754,7 +754,7 @@ steps: - vllm/model_executor/layers/quantization autorun_on_main: true commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - label: OpenAI API correctness # 10min timeout_in_minutes: 15 @@ -1203,7 +1203,7 @@ steps: - csrc/ - vllm/model_executor/layers/quantization commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt ##### 1 GPU test ##### ##### multi gpus test ##### @@ -1521,7 +1521,7 @@ 
steps: - csrc/ - vllm/model_executor/layers/quantization commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - label: LM Eval Large Models (4 Card) mirror_hardwares: [amdexperimental, amdproduction] From ec965569d94c09eb1c85d235319b24d1b795d048 Mon Sep 17 00:00:00 2001 From: Yihua Cheng Date: Wed, 17 Dec 2025 21:31:34 -0800 Subject: [PATCH 049/176] [KV connector][LMCache] Only record the cuda event when there are request to store/load (#30814) Signed-off-by: ApostaC --- .../multi_process_adapter.py | 1 + .../kv_connector/v1/lmcache_mp_connector.py | 56 +++++++++++++------ 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py index 6acfb73997f25..6656b5a25f83d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py @@ -262,6 +262,7 @@ class LMCacheMPWorkerAdapter: ): keys = [] block_ids = [] + for op in ops: keys.extend(self._block_hashes_to_keys(op.block_hashes)) block_ids.extend(op.block_ids) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py index 78256a6552c22..995708b89bc26 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py @@ -24,6 +24,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration import ( ) from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import KVConnectorOutput +from vllm.v1.request import RequestStatus 
from vllm.v1.utils import ConstantList if TYPE_CHECKING: @@ -211,7 +212,7 @@ class LMCacheMPRequestTracker: """ self.num_stored_blocks += num_new_blocks - def update_block_ids( + def append_block_ids( self, new_block_ids: list[int], ): @@ -455,10 +456,6 @@ class LMCacheMPConnector(KVConnectorBase_V1): metadata = self._get_connector_metadata() assert isinstance(metadata, LMCacheMPConnectorMetadata) - with torch.cuda.stream(torch.cuda.current_stream()): - event = torch.cuda.Event(interprocess=True) - event.record() - request_ids = [] ops = [] @@ -468,10 +465,14 @@ class LMCacheMPConnector(KVConnectorBase_V1): request_ids.append(meta.request_id) ops.append(meta.op) - if len(request_ids) > 0: - self.worker_adapter.batched_submit_retrieve_requests( - request_ids, ops, event - ) + if len(request_ids) == 0: + return + + with torch.cuda.stream(torch.cuda.current_stream()): + event = torch.cuda.Event(interprocess=True) + event.record() + + self.worker_adapter.batched_submit_retrieve_requests(request_ids, ops, event) def wait_for_layer_load(self, layer_name: str) -> None: """ @@ -518,10 +519,6 @@ class LMCacheMPConnector(KVConnectorBase_V1): metadata = self._get_connector_metadata() assert isinstance(metadata, LMCacheMPConnectorMetadata) - with torch.cuda.stream(torch.cuda.current_stream()): - event = torch.cuda.Event(interprocess=True) - event.record() - request_ids = [] ops = [] for meta in metadata.requests: @@ -530,8 +527,14 @@ class LMCacheMPConnector(KVConnectorBase_V1): request_ids.append(meta.request_id) ops.append(meta.op) - if len(request_ids) > 0: - self.worker_adapter.batched_submit_store_requests(request_ids, ops, event) + if len(request_ids) == 0: + return + + with torch.cuda.stream(torch.cuda.current_stream()): + event = torch.cuda.Event(interprocess=True) + event.record() + + self.worker_adapter.batched_submit_store_requests(request_ids, ops, event) def get_finished( self, finished_req_ids: set[str] @@ -627,6 +630,9 @@ class 
LMCacheMPConnector(KVConnectorBase_V1): into account. """ tracker = self._get_or_create_request_tracker(request) + # TODO: support loading KV for preempted requests in the future + if request.status == RequestStatus.PREEMPTED: + return 0, False self.scheduler_adapter.maybe_submit_lookup_request( request.request_id, convert_block_hashes_to_bytes(request.block_hashes) @@ -683,7 +689,7 @@ class LMCacheMPConnector(KVConnectorBase_V1): # No matter we need to retrieve or not, we need to update # the block ids into the tracker - tracker.update_block_ids(block_ids) + tracker.append_block_ids(block_ids) # Update the state of the tracker condition = tracker.needs_retrieve() @@ -866,7 +872,8 @@ class LMCacheMPConnector(KVConnectorBase_V1): # Update block ids new_block_ids = reformat_block_ids(cached_reqs.new_block_ids[idx]) - request_tracker.update_block_ids(new_block_ids) + if request_id not in cached_reqs.resumed_req_ids: + request_tracker.append_block_ids(new_block_ids) # Update new scheduled tokens num_new_tokens = cached_reqs.num_computed_tokens[idx] @@ -889,6 +896,21 @@ class LMCacheMPConnector(KVConnectorBase_V1): self, request: "Request" ) -> LMCacheMPRequestTracker: request_id = request.request_id + # Remove the old trackers that is created before the preemption + if ( + request.status == RequestStatus.PREEMPTED + and request_id in self.request_trackers + ): + tracker = self.request_trackers[request_id] + + # NOTE: since this function may be called multiple times + # for a single request (because get_num_new_matched_tokens + # may be called multiple times) for the same request, we + # will only do the remove if the tracker is not in the "fresh" + # state, i.e., PREFETCHING + if tracker.state != LMCacheMPRequestState.PREFETCHING: + self.request_trackers.pop(request_id) + if request_id not in self.request_trackers: new_tracker = LMCacheMPRequestTracker(request) self.request_trackers[request_id] = new_tracker From fc2ae6d6177c0b27a10f6c930e335ac0ec240982 Mon Sep 17 
00:00:00 2001 From: Nathan Price <125999937+TheCodeWrangler@users.noreply.github.com> Date: Thu, 18 Dec 2025 00:12:29 -0600 Subject: [PATCH 050/176] fix: add warmup for audio preprocessing (#30706) Signed-off-by: Nathan Price Co-authored-by: Cyrus Leung --- vllm/entrypoints/openai/speech_to_text.py | 127 +++++++++++++++++++++- 1 file changed, 126 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index df9c06adb105a..3e648f44f380b 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -35,7 +35,7 @@ from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRe from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.inputs.data import PromptType from vllm.logger import init_logger -from vllm.model_executor.models import SupportsTranscription +from vllm.model_executor.models import SupportsTranscription, supports_transcription from vllm.outputs import RequestOutput from vllm.tokenizers import get_tokenizer from vllm.utils.import_utils import PlaceholderModule @@ -112,6 +112,131 @@ class OpenAISpeechToText(OpenAIServing): self.default_sampling_params, ) + # Warm up audio preprocessing to avoid first-request latency + self._warmup_audio_preprocessing() + # Warm up input processor with dummy audio + self._warmup_input_processor() + + def _warmup_audio_preprocessing(self) -> None: + """Warm up audio processing libraries to avoid first-request latency. + + The first call to librosa functions (load, get_duration, mel-spectrogram) + triggers JIT compilation and library initialization which can take ~7s. + This method warms up these operations during server initialization. 
+ """ + # Skip warmup if librosa is not installed (optional dependency) + if isinstance(librosa, PlaceholderModule): + return + + # Skip warmup if model doesn't support transcription + if not supports_transcription(self.model_cls): + return + + try: + warmup_start = time.perf_counter() + logger.info("Warming up audio preprocessing libraries...") + + # Create a minimal dummy audio (1 second of silence at target sample rate) + dummy_audio = np.zeros(int(self.asr_config.sample_rate), dtype=np.float32) + + # Warm up librosa.load by using librosa functions on the dummy data + # This initializes FFTW, numba JIT, and other audio processing libraries + _ = librosa.get_duration(y=dummy_audio, sr=self.asr_config.sample_rate) + + # Warm up mel-spectrogram computation with model-specific parameters + from vllm.transformers_utils.processor import ( + cached_processor_from_config, + ) + + processor = cached_processor_from_config(self.model_config) + feature_extractor = None + if hasattr(processor, "feature_extractor"): + feature_extractor = processor.feature_extractor + elif hasattr(processor, "audio_processor"): + # For models like GraniteSpeech that use audio_processor + audio_proc = processor.audio_processor + if hasattr(audio_proc, "feature_extractor"): + feature_extractor = audio_proc.feature_extractor + # If audio_processor doesn't have feature_extractor, + # skip mel-spectrogram warmup for these models + + if feature_extractor is not None: + _ = librosa.feature.melspectrogram( + y=dummy_audio, + sr=self.asr_config.sample_rate, + n_mels=getattr(feature_extractor, "n_mels", 128), + n_fft=getattr(feature_extractor, "n_fft", 400), + hop_length=getattr(feature_extractor, "hop_length", 160), + ) + + warmup_elapsed = time.perf_counter() - warmup_start + logger.info("Audio preprocessing warmup completed in %.2fs", warmup_elapsed) + except Exception: + # Don't fail initialization if warmup fails - log exception and continue + logger.exception( + "Audio preprocessing warmup failed 
(non-fatal): %s. " + "First request may experience higher latency.", + ) + + def _warmup_input_processor(self) -> None: + """Warm up input processor with dummy audio to avoid first-request latency. + + The first call to input_processor.process_inputs() with multimodal audio + triggers multimodal processing initialization which can take ~2.5s. + This method processes a dummy audio request to warm up the pipeline. + """ + # Skip warmup if model doesn't support transcription + if not supports_transcription(self.model_cls): + return + + # Only warm up if model supports transcription methods + if not hasattr(self.model_cls, "get_generation_prompt"): + return + + try: + from vllm.sampling_params import SamplingParams + + warmup_start = time.perf_counter() + logger.info("Warming up multimodal input processor...") + + # Create minimal dummy audio (1 second of silence) + dummy_audio = np.zeros(int(self.asr_config.sample_rate), dtype=np.float32) + + # Use the same method that _preprocess_speech_to_text uses + # to create the prompt + dummy_prompt = self.model_cls.get_generation_prompt( + audio=dummy_audio, + stt_config=self.asr_config, + model_config=self.model_config, + language="en", + task_type=self.task_type, + request_prompt="", + to_language=None, + ) + + # Create minimal sampling params + dummy_params = SamplingParams( + max_tokens=1, + temperature=0.0, + ) + + # Process the dummy input through the input processor + # This will trigger all the multimodal processing initialization + _ = self.input_processor.process_inputs( + request_id="warmup", + prompt=dummy_prompt, + params=dummy_params, + ) + + warmup_elapsed = time.perf_counter() - warmup_start + logger.info("Input processor warmup completed in %.2fs", warmup_elapsed) + except Exception: + # Don't fail initialization if warmup fails - log warning and continue + logger.exception( + "Input processor warmup failed (non-fatal): %s. " + "First request may experience higher latency." 
+ ) + @cached_property def model_cls(self) -> type[SupportsTranscription]: from vllm.model_executor.model_loader import get_model_cls From e3ab93c89667bcd026582e74bc7c49774724a273 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 18 Dec 2025 14:36:49 +0800 Subject: [PATCH 051/176] [CPU] Refactor CPU fused MOE (#30531) Signed-off-by: jiang1.li --- .../scripts/hardware_ci/run-cpu-test.sh | 1 + cmake/cpu_extension.cmake | 4 +- .../{cpu_attn_macros.h => cpu_arch_macros.h} | 10 +- csrc/cpu/cpu_attn_impl.hpp | 41 +- csrc/cpu/cpu_fused_moe.cpp | 727 ++++++++++++++++++ csrc/cpu/cpu_types_x86.hpp | 8 + csrc/cpu/cpu_wna16.cpp | 18 +- csrc/cpu/dnnl_helper.cpp | 12 +- csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp | 33 + csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp | 38 + csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp | 19 + csrc/cpu/scratchpad_manager.cpp | 23 - csrc/cpu/scratchpad_manager.h | 31 - csrc/cpu/torch_bindings.cpp | 24 + csrc/cpu/utils.cpp | 24 +- csrc/cpu/utils.hpp | 89 ++- docker/Dockerfile.cpu | 4 +- requirements/cpu-build.txt | 2 +- requirements/cpu.txt | 2 + tests/kernels/moe/test_cpu_fused_moe.py | 172 +++++ vllm/_custom_ops.py | 36 + .../layers/fused_moe/cpu_fused_moe.py | 264 +++++-- vllm/model_executor/layers/fused_moe/layer.py | 6 +- 23 files changed, 1388 insertions(+), 200 deletions(-) rename csrc/cpu/{cpu_attn_macros.h => cpu_arch_macros.h} (97%) create mode 100644 csrc/cpu/cpu_fused_moe.cpp delete mode 100644 csrc/cpu/scratchpad_manager.cpp delete mode 100644 csrc/cpu/scratchpad_manager.h create mode 100644 tests/kernels/moe/test_cpu_fused_moe.py diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 438fe522c8702..471c8616df85c 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -50,6 +50,7 @@ function cpu_tests() { docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -x -v -s tests/kernels/attention/test_cpu_attn.py + pytest 
-x -v -s tests/kernels/moe/test_cpu_fused_moe.py pytest -x -v -s tests/kernels/test_onednn.py" # Run basic model test diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 85b286f8d8d0a..0af87fd7f0b53 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -330,7 +330,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON PUBLIC ${oneDNN_BINARY_DIR}/include PRIVATE ${oneDNN_SOURCE_DIR}/src ) - target_link_libraries(dnnl_ext dnnl) + target_link_libraries(dnnl_ext dnnl torch) target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC) list(APPEND LIBS dnnl_ext) set(USE_ONEDNN ON) @@ -358,13 +358,13 @@ set(VLLM_EXT_SRC "csrc/cpu/pos_encoding.cpp" "csrc/moe/dynamic_4bit_int_moe_cpu.cpp" "csrc/cpu/cpu_attn.cpp" - "csrc/cpu/scratchpad_manager.cpp" "csrc/cpu/torch_bindings.cpp") if (AVX512_FOUND AND NOT AVX512_DISABLED) set(VLLM_EXT_SRC "csrc/cpu/shm.cpp" "csrc/cpu/cpu_wna16.cpp" + "csrc/cpu/cpu_fused_moe.cpp" ${VLLM_EXT_SRC}) if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) set(VLLM_EXT_SRC diff --git a/csrc/cpu/cpu_attn_macros.h b/csrc/cpu/cpu_arch_macros.h similarity index 97% rename from csrc/cpu/cpu_attn_macros.h rename to csrc/cpu/cpu_arch_macros.h index 35716a0790ab3..c73b62ecdec90 100644 --- a/csrc/cpu/cpu_attn_macros.h +++ b/csrc/cpu/cpu_arch_macros.h @@ -1,5 +1,5 @@ -#ifndef CPU_ATTN_MACROS_H -#define CPU_ATTN_MACROS_H +#ifndef CPU_ARCH_MACROS_H +#define CPU_ARCH_MACROS_H // x86_64 #ifdef __x86_64__ @@ -26,7 +26,7 @@ _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); \ const __m512i vec_127 = _mm512_set1_epi32(0x0000007f); \ const int n_mantissa_bits = 23; \ - auto fast_exp = [&](vec_op::FP32Vec16& vec) __attribute__(( \ + auto fast_exp = [&](const vec_op::FP32Vec16& vec) __attribute__(( \ always_inline)) { \ __m512 values = vec.reg; \ auto less_ln_flt_min_mask = \ @@ -98,7 +98,7 @@ poly = vbslq_f32(hi_mask, inf, poly); \ return vbslq_f32(lo_mask, zero, poly); \ }; \ - auto fast_exp = 
[&](vec_op::FP32Vec16& vec) \ + auto fast_exp = [&](const vec_op::FP32Vec16& vec) \ __attribute__((always_inline)) { \ float32x4x4_t result; \ result.val[0] = neon_expf(vec.reg.val[0]); \ @@ -110,4 +110,4 @@ #endif // __aarch64__ -#endif \ No newline at end of file +#endif diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp index e3e077b845f4f..08d208e05a62c 100644 --- a/csrc/cpu/cpu_attn_impl.hpp +++ b/csrc/cpu/cpu_attn_impl.hpp @@ -8,10 +8,8 @@ #include #endif -#include "cpu_types.hpp" -#include "scratchpad_manager.h" -#include "cpu_attn_macros.h" -#include "utils.hpp" +#include "cpu/cpu_arch_macros.h" +#include "cpu/utils.hpp" namespace cpu_attention { enum class ISA { AMX, VEC, VEC16, NEON }; @@ -378,12 +376,13 @@ class AttentionScheduler { static constexpr int32_t MaxQTileIterNum = 128; - AttentionScheduler() : available_cache_size_(get_available_l2_size()) {} + AttentionScheduler() + : available_cache_size_(cpu_utils::get_available_l2_size()) {} torch::Tensor schedule(const ScheduleInput& input) const { const bool casual = input.casual; const int32_t thread_num = omp_get_max_threads(); - const int64_t cache_size = get_available_l2_size(); + const int64_t cache_size = cpu_utils::get_available_l2_size(); const int32_t max_num_q_per_iter = input.max_num_q_per_iter; const int32_t kv_len_alignment = input.kv_block_alignment; int32_t q_head_per_kv = input.num_heads_q / input.num_heads_kv; @@ -659,7 +658,7 @@ class AttentionScheduler { metadata_ptr->thread_num + metadata_ptr->reduction_scratchpad_size_per_kv_head * (use_gqa ? 
input.num_heads_kv : input.num_heads_q); - DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc( + cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc( scratchpad_size); // metadata_ptr->print(); @@ -667,7 +666,7 @@ class AttentionScheduler { // test out of boundary access // { // float* cache_ptr = - // DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data(); + // cpu_utils::ScratchPadManager::getl_scratchpad_manager()->get_data(); // for (int64_t i = 0; i < scratchpad_size / sizeof(float); ++i) { // cache_ptr[i] = std::numeric_limits::quiet_NaN(); // } @@ -749,27 +748,6 @@ class AttentionScheduler { return std::max(rounded_tile_size, round_size); } - static int64_t get_available_l2_size() { - static int64_t size = []() { -#if defined(__APPLE__) - // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname. - int64_t l2_cache_size = 0; - size_t len = sizeof(l2_cache_size); - if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 && - l2_cache_size > 0) { - return l2_cache_size >> 1; // use 50% of L2 cache - } - // Fallback if sysctlbyname fails - return 128LL * 1024 >> 1; // use 50% of 128KB -#else - long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); - TORCH_CHECK_NE(l2_cache_size, -1); - return l2_cache_size >> 1; // use 50% of L2 cache -#endif - }(); - return size; - } - private: int64_t available_cache_size_; }; @@ -1402,7 +1380,7 @@ class AttentionMainLoop { // init buffers void* scratchpad_ptr = - DNNLScratchPadManager::get_dnnl_scratchpad_manager() + cpu_utils::ScratchPadManager::get_scratchpad_manager() ->get_data(); AttentionScratchPad buffer_manager(thread_id, metadata, scratchpad_ptr); @@ -1422,8 +1400,7 @@ class AttentionMainLoop { } } - const int64_t available_cache_size = - AttentionScheduler::get_available_l2_size(); + const int64_t available_cache_size = cpu_utils::get_available_l2_size(); const int32_t default_tile_size = AttentionScheduler::calcu_default_tile_size( available_cache_size, head_dim, 
sizeof(kv_cache_t), diff --git a/csrc/cpu/cpu_fused_moe.cpp b/csrc/cpu/cpu_fused_moe.cpp new file mode 100644 index 0000000000000..090e2d4cd4b56 --- /dev/null +++ b/csrc/cpu/cpu_fused_moe.cpp @@ -0,0 +1,727 @@ +#include "cpu/cpu_types.hpp" +#include "cpu/utils.hpp" +#include "cpu/micro_gemm/cpu_micro_gemm_vec.hpp" +#include "cpu/cpu_arch_macros.h" + +#ifdef CPU_CAPABILITY_AMXBF16 + #include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp" + #define AMX_DISPATCH(...) \ + case cpu_utils::ISA::AMX: { \ + using gemm_t = cpu_micro_gemm::MicroGemm; \ + return __VA_ARGS__(); \ + } +#else + #define AMX_DISPATCH(...) case cpu_utils::ISA::AMX: +#endif + +#define CPU_ISA_DISPATCH_IMPL(ISA_TYPE, ...) \ + [&] { \ + switch (ISA_TYPE) { \ + AMX_DISPATCH(__VA_ARGS__) \ + case cpu_utils::ISA::VEC: { \ + using gemm_t = \ + cpu_micro_gemm::MicroGemm; \ + return __VA_ARGS__(); \ + } \ + default: { \ + TORCH_CHECK(false, "Invalid CPU ISA type."); \ + } \ + } \ + }() + +namespace { +enum class FusedMOEAct { SiluAndMul, SwigluOAIAndMul }; + +FusedMOEAct get_act_type(const std::string& act) { + if (act == "silu") { + return FusedMOEAct::SiluAndMul; + } else if (act == "swigluoai") { + return FusedMOEAct::SwigluOAIAndMul; + } else { + TORCH_CHECK(false, "Invalid act type: " + act); + } +} + +template +void swigluoai_and_mul(float* __restrict__ input, scalar_t* __restrict__ output, + const int32_t m_size, const int32_t n_size, + const int32_t input_stride, + const int32_t output_stride) { + using scalar_vec_t = typename cpu_utils::VecTypeTrait::vec_t; + // For GPT-OSS interleaved gate-up weights + alignas(64) static int32_t index[16] = {0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30}; + vec_op::INT32Vec16 index_vec(index); + vec_op::FP32Vec16 gate_up_max_vec(7.0); + vec_op::FP32Vec16 up_min_vec(-7.0); + vec_op::FP32Vec16 alpha_vec(1.702); + vec_op::FP32Vec16 one_vec(1.0); + + DEFINE_FAST_EXP + + for (int32_t m = 0; m < m_size; ++m) { + for (int32_t n = 0; n < n_size; n += 32) { + 
vec_op::FP32Vec16 gate_vec(input + n, index_vec); + vec_op::FP32Vec16 up_vec(input + n + 1, index_vec); + gate_vec = gate_vec.min(gate_up_max_vec); + up_vec = up_vec.clamp(up_min_vec, gate_up_max_vec); + auto sigmoid_vec = one_vec / (one_vec + fast_exp(-gate_vec * alpha_vec)); + auto glu = gate_vec * sigmoid_vec; + auto gated_output_fp32 = (one_vec + up_vec) * glu; + scalar_vec_t gated_output = scalar_vec_t(gated_output_fp32); + gated_output.save(output + n / 2); + } + input += input_stride; + output += output_stride; + } +} + +template +void silu_and_mul(float* __restrict__ input, scalar_t* __restrict__ output, + const int32_t m_size, const int32_t n_size, + const int32_t input_stride, const int32_t output_stride) { + using scalar_vec_t = typename cpu_utils::VecTypeTrait::vec_t; + const int32_t dim = n_size / 2; + float* __restrict__ gate = input; + float* __restrict__ up = input + dim; + vec_op::FP32Vec16 one_vec(1.0); + + DEFINE_FAST_EXP + + for (int32_t m = 0; m < m_size; ++m) { + for (int32_t n = 0; n < dim; n += 16) { + vec_op::FP32Vec16 gate_vec(gate + n); + vec_op::FP32Vec16 up_vec(up + n); + auto sigmoid_vec = one_vec / (one_vec + fast_exp(-gate_vec)); + auto silu = gate_vec * sigmoid_vec; + auto gated_output_fp32 = up_vec * silu; + scalar_vec_t gated_output = scalar_vec_t(gated_output_fp32); + gated_output.save(output + n); + } + gate += input_stride; + up += input_stride; + output += output_stride; + } +} + +template +FORCE_INLINE void apply_gated_act(const FusedMOEAct act, + float* __restrict__ input, + scalar_t* __restrict__ output, + const int32_t m, const int32_t n, + const int32_t input_stride, + const int32_t output_stride) { + switch (act) { + case FusedMOEAct::SwigluOAIAndMul: + swigluoai_and_mul(input, output, m, n, input_stride, output_stride); + return; + case FusedMOEAct::SiluAndMul: + silu_and_mul(input, output, m, n, input_stride, output_stride); + return; + default: + TORCH_CHECK(false, "Unsupported act type."); + } +} + +template +void 
prepack_moe_weight_impl(scalar_t* __restrict__ weight_ptr, + scalar_t* __restrict__ packed_weight_ptr, + const int32_t expert_num, + const int32_t output_size, + const int32_t input_size, + const int64_t expert_stride) { +#pragma omp parallel for + for (int32_t e_idx = 0; e_idx < expert_num; ++e_idx) { + gemm_t::pack_weight(weight_ptr + expert_stride * e_idx, + packed_weight_ptr + expert_stride * e_idx, output_size, + input_size); + } +} + +template +void fused_moe_impl(scalar_t* __restrict__ output, scalar_t* __restrict__ input, + w_t* __restrict__ w13, w_t* __restrict__ w2, + w_t* __restrict__ w13_bias, w_t* __restrict__ w2_bias, + float* __restrict__ topk_weights, + int32_t* __restrict__ topk_id, FusedMOEAct act_type, + const int32_t token_num, const int32_t expert_num, + const int32_t topk_num, const int32_t input_size_13, + const int32_t output_size_13, const int32_t input_size_2, + const int32_t output_size_2) { + using scalar_vec_t = typename cpu_utils::VecTypeTrait::vec_t; + constexpr int32_t gemm_n_tile_size = gemm_t::NSize; + constexpr int32_t gemm_m_tile_size = gemm_t::MaxMSize; + constexpr int32_t min_w13_n_tile_size = 2 * gemm_n_tile_size; + static_assert(gemm_n_tile_size % 16 == 0); + + TORCH_CHECK_EQ(output_size_13 % min_w13_n_tile_size, 0); + TORCH_CHECK_EQ(output_size_2 % gemm_n_tile_size, 0); + TORCH_CHECK_EQ(output_size_13 / 2, input_size_2); + + const int32_t thread_num = omp_get_max_threads(); + + const int32_t w13_input_buffer_size = cpu_utils::round_up<64>( + gemm_m_tile_size * input_size_13 * sizeof(scalar_t)); + + const int32_t w13_n_tile_size = [&]() { + const int64_t cache_size = cpu_utils::get_available_l2_size(); + // input buffer + output buffer + weight + const int32_t n_size_cache_limit = + (cache_size - w13_input_buffer_size) / + (gemm_m_tile_size * sizeof(float) + input_size_13 * sizeof(scalar_t)); + const int32_t n_size_thread_limit = + output_size_13 / std::max(1, thread_num / topk_num); + const int32_t n_size = 
cpu_utils::round_down( + std::min(n_size_cache_limit, n_size_thread_limit)); + return std::max(n_size, min_w13_n_tile_size); + }(); + + const int32_t w2_input_tile_size = cpu_utils::round_up<64>( + gemm_m_tile_size * input_size_2 * sizeof(scalar_t)); + + const int32_t w2_n_tile_size = [&]() { + const int64_t cache_size = cpu_utils::get_available_l2_size(); + // input tile + weight + const int32_t n_size_cache_limit = + (cache_size - w2_input_tile_size) / (input_size_2 * sizeof(scalar_t)); + const int32_t n_size_thread_limit = + output_size_2 / std::max(1, thread_num / topk_num); + const int32_t n_size = cpu_utils::round_down( + std::min(n_size_cache_limit, n_size_thread_limit)); + return std::max(n_size, gemm_n_tile_size); + }(); + + // allocate buffers + int32_t common_buffer_offset = 0; + int32_t w13_thread_buffer_offset = 0; + int32_t ws_thread_buffer_offset = 0; + + // common buffers + const int32_t token_num_per_group_buffer_size = + cpu_utils::round_up<64>(expert_num * sizeof(int32_t)); + const int32_t token_num_per_group_buffer_offset = common_buffer_offset; + common_buffer_offset += token_num_per_group_buffer_size; + + const int32_t cu_token_num_per_group_buffer_size = + cpu_utils::round_up<64>((expert_num + 1) * sizeof(int32_t)); + const int32_t cu_token_num_per_group_buffer_offset = common_buffer_offset; + common_buffer_offset += cu_token_num_per_group_buffer_size; + + const int32_t expand_token_id_buffer_size = + cpu_utils::round_up<64>(token_num * topk_num * sizeof(int32_t)); + const int32_t expand_token_id_buffer_offset = common_buffer_offset; + common_buffer_offset += expand_token_id_buffer_size; + + const int32_t expand_token_id_index_buffer_size = + cpu_utils::round_up<64>(token_num * topk_num * sizeof(int32_t)); + const int32_t expand_token_id_index_buffer_offset = common_buffer_offset; + common_buffer_offset += expand_token_id_index_buffer_size; + + const int32_t w13_gemm_output_buffer_size = cpu_utils::round_up<64>( + token_num * topk_num * 
(output_size_13 / 2) * sizeof(scalar_t)); + const int32_t w13_gemm_output_buffer_offset = common_buffer_offset; + common_buffer_offset += w13_gemm_output_buffer_size; + + const int32_t w2_gemm_output_buffer_size = cpu_utils::round_up<64>( + token_num * topk_num * output_size_2 * sizeof(float)); + const int32_t w2_gemm_output_buffer_offset = common_buffer_offset; + common_buffer_offset += w2_gemm_output_buffer_size; + + // w13 GEMM thread buffers + const int32_t w13_input_buffer_offset = w13_thread_buffer_offset; + w13_thread_buffer_offset += w13_input_buffer_size; + + const int32_t w13_output_buffer_size = cpu_utils::round_up<64>( + gemm_m_tile_size * w13_n_tile_size * sizeof(float)); + const int32_t w13_output_buffer_offset = w13_thread_buffer_offset; + w13_thread_buffer_offset += w13_output_buffer_size; + + // Weighted sum thread buffer + const int32_t ws_output_buffer_size = + cpu_utils::round_up<64>(output_size_2 * sizeof(float)); + const int32_t ws_output_buffer_offset = ws_thread_buffer_offset; + ws_thread_buffer_offset += ws_output_buffer_size; + + const int32_t buffer_size = + common_buffer_offset + + std::max(w13_thread_buffer_offset, ws_thread_buffer_offset) * thread_num; + cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(buffer_size); + uint8_t* common_buffer_start = + cpu_utils::ScratchPadManager::get_scratchpad_manager() + ->get_data(); + uint8_t* thread_buffer_start = common_buffer_start + common_buffer_offset; + + int32_t* __restrict__ token_num_per_group_buffer = reinterpret_cast( + common_buffer_start + token_num_per_group_buffer_offset); + int32_t* __restrict__ cu_token_num_per_group_buffer = + reinterpret_cast(common_buffer_start + + cu_token_num_per_group_buffer_offset); + int32_t* __restrict__ expand_token_id_buffer = reinterpret_cast( + common_buffer_start + expand_token_id_buffer_offset); + int32_t* __restrict__ expand_token_id_index_buffer = + reinterpret_cast(common_buffer_start + + expand_token_id_index_buffer_offset); + + 
// prepare token-expert mappings + { + std::memset(token_num_per_group_buffer, 0, expert_num * sizeof(int32_t)); + for (int32_t i = 0; i < token_num * topk_num; ++i) { + int32_t curr_expert_id = topk_id[i]; + ++token_num_per_group_buffer[curr_expert_id]; + } + + int32_t token_num_sum = 0; + cu_token_num_per_group_buffer[0] = 0; + int32_t* token_index_buffer = cu_token_num_per_group_buffer + 1; + for (int32_t i = 0; i < expert_num; ++i) { + token_index_buffer[i] = token_num_sum; + token_num_sum += token_num_per_group_buffer[i]; + } + + for (int32_t i = 0; i < token_num; ++i) { + int32_t* curr_topk_id = topk_id + i * topk_num; + int32_t* curr_index_buffer = expand_token_id_index_buffer + i * topk_num; + for (int32_t j = 0; j < topk_num; ++j) { + int32_t curr_expert_id = curr_topk_id[j]; + int32_t curr_index = token_index_buffer[curr_expert_id]; + ++token_index_buffer[curr_expert_id]; + expand_token_id_buffer[curr_index] = i; + curr_index_buffer[j] = curr_index; + } + } + } + + // w13 GEMM + act + { + alignas(64) cpu_utils::Counter counter; + cpu_utils::Counter* counter_ptr = &counter; + +#pragma omp parallel for schedule(static, 1) + for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) { + const int32_t task_num_per_expert = + (output_size_13 + w13_n_tile_size - 1) / w13_n_tile_size; + const int32_t task_num = task_num_per_expert * expert_num; + + uint8_t* __restrict__ thread_buffer = + thread_buffer_start + thread_id * w13_thread_buffer_offset; + scalar_t* __restrict__ w13_input_buffer = + reinterpret_cast(thread_buffer + w13_input_buffer_offset); + float* __restrict__ w13_output_buffer = + reinterpret_cast(thread_buffer + w13_output_buffer_offset); + scalar_t* __restrict__ w13_gemm_output_buffer = + reinterpret_cast(common_buffer_start + + w13_gemm_output_buffer_offset); + + gemm_t gemm; + + const int32_t input_size_13_bytes = input_size_13 * sizeof(scalar_t); + const int32_t w13_n_group_stride = 16 * input_size_13; + const int32_t w13_n_tile_stride = 
gemm_n_tile_size * input_size_13; + + for (;;) { + int32_t task_id = counter_ptr->acquire_counter(); + if (task_id >= task_num) { + break; + } + + const int32_t curr_expert_id = task_id / task_num_per_expert; + const int32_t curr_output_group_id = task_id % task_num_per_expert; + const int32_t curr_token_num = + token_num_per_group_buffer[curr_expert_id]; + if (curr_token_num == 0) { + continue; + } + + const int32_t actual_n_tile_size = + std::min(w13_n_tile_size, + output_size_13 - curr_output_group_id * w13_n_tile_size); + const int32_t* __restrict__ curr_expand_token_id_buffer = + expand_token_id_buffer + + cu_token_num_per_group_buffer[curr_expert_id]; + scalar_t* __restrict__ curr_w13_gemm_output_buffer = + w13_gemm_output_buffer + + cu_token_num_per_group_buffer[curr_expert_id] * + (output_size_13 / 2) + + curr_output_group_id * w13_n_tile_size / 2; + + w_t* __restrict__ w13_weight_ptr_0 = nullptr; + w_t* __restrict__ w13_weight_ptr_1 = nullptr; + w_t* __restrict__ w13_bias_ptr_0 = nullptr; + w_t* __restrict__ w13_bias_ptr_1 = nullptr; + if (act_type == FusedMOEAct::SwigluOAIAndMul) { + // For SwigluOAIAndMul, up and down weights are interleaved + w13_weight_ptr_0 = + w13 + curr_expert_id * input_size_13 * output_size_13 + + curr_output_group_id * w13_n_tile_size * input_size_13; + w13_weight_ptr_1 = + w13_weight_ptr_0 + actual_n_tile_size / 2 * input_size_13; + if (w13_bias != nullptr) { + w13_bias_ptr_0 = w13_bias + curr_expert_id * output_size_13 + + curr_output_group_id * w13_n_tile_size; + w13_bias_ptr_1 = w13_bias_ptr_0 + actual_n_tile_size / 2; + } + } else { + w13_weight_ptr_0 = + w13 + curr_expert_id * input_size_13 * output_size_13 + + curr_output_group_id * (w13_n_tile_size / 2) * input_size_13; + w13_weight_ptr_1 = + w13_weight_ptr_0 + output_size_13 / 2 * input_size_13; + if (w13_bias != nullptr) { + w13_bias_ptr_0 = w13_bias + curr_expert_id * output_size_13 + + curr_output_group_id * (w13_n_tile_size / 2); + w13_bias_ptr_1 = w13_bias_ptr_0 + 
output_size_13 / 2; + } + } + + scalar_t* __restrict__ curr_w13_input_buffer = w13_input_buffer; + for (int32_t token_idx = 0; token_idx < curr_token_num; + token_idx += gemm_m_tile_size) { + const int32_t actual_token_num = + std::min(gemm_m_tile_size, curr_token_num - token_idx); + // copy inputs + { + scalar_t* __restrict__ curr_w13_input_buffer_iter = + curr_w13_input_buffer; + for (int32_t i = 0; i < actual_token_num; ++i) { + const int32_t curr_token_id = curr_expand_token_id_buffer[i]; + int8_t* __restrict__ curr_input_iter = reinterpret_cast( + input + curr_token_id * input_size_13); + int8_t* __restrict__ curr_output_iter = + reinterpret_cast(curr_w13_input_buffer_iter); + int32_t j = 0; + for (; j < input_size_13_bytes - 64; j += 64) { + vec_op::INT8Vec64 vec(curr_input_iter); + vec.save(curr_output_iter); + curr_input_iter += 64; + curr_output_iter += 64; + } + vec_op::INT8Vec64 vec(curr_input_iter); + vec.save(curr_output_iter, input_size_13_bytes - j); + + // update + curr_w13_input_buffer_iter += input_size_13; + } + // update + curr_expand_token_id_buffer += actual_token_num; + } + + // gemm + act + { + scalar_t* __restrict__ w13_weight_ptr_0_iter = w13_weight_ptr_0; + scalar_t* __restrict__ w13_weight_ptr_1_iter = w13_weight_ptr_1; + scalar_t* __restrict__ w13_bias_ptr_0_iter = w13_bias_ptr_0; + scalar_t* __restrict__ w13_bias_ptr_1_iter = w13_bias_ptr_1; + scalar_t* __restrict__ curr_w13_input_buffer_iter = + curr_w13_input_buffer; + float* __restrict__ w13_output_buffer_0_iter = w13_output_buffer; + float* __restrict__ w13_output_buffer_1_iter = + w13_output_buffer + actual_n_tile_size / 2; + for (int32_t i = 0; i < actual_n_tile_size; + i += min_w13_n_tile_size) { + gemm.gemm(curr_w13_input_buffer_iter, w13_weight_ptr_0_iter, + w13_output_buffer_0_iter, actual_token_num, + input_size_13, input_size_13, w13_n_group_stride, + actual_n_tile_size, false); + + if (w13_bias != nullptr) { + cpu_micro_gemm::add_bias_epilogue( + w13_output_buffer_0_iter, 
w13_output_buffer_0_iter, + w13_bias_ptr_0_iter, actual_token_num, actual_n_tile_size, + actual_n_tile_size); + w13_bias_ptr_0_iter += gemm_n_tile_size; + } + + gemm.gemm(curr_w13_input_buffer_iter, w13_weight_ptr_1_iter, + w13_output_buffer_1_iter, actual_token_num, + input_size_13, input_size_13, w13_n_group_stride, + actual_n_tile_size, false); + + if (w13_bias != nullptr) { + cpu_micro_gemm::add_bias_epilogue( + w13_output_buffer_1_iter, w13_output_buffer_1_iter, + w13_bias_ptr_1_iter, actual_token_num, actual_n_tile_size, + actual_n_tile_size); + w13_bias_ptr_1_iter += gemm_n_tile_size; + } + + // update + w13_weight_ptr_0_iter += w13_n_tile_stride; + w13_weight_ptr_1_iter += w13_n_tile_stride; + w13_output_buffer_0_iter += gemm_n_tile_size; + w13_output_buffer_1_iter += gemm_n_tile_size; + } + + apply_gated_act(act_type, w13_output_buffer, + curr_w13_gemm_output_buffer, actual_token_num, + actual_n_tile_size, actual_n_tile_size, + output_size_13 / 2); + + // update + curr_w13_gemm_output_buffer += + gemm_m_tile_size * (output_size_13 / 2); + } + } + } + } + } + + // w2 GEMM + { + alignas(64) cpu_utils::Counter counter; + cpu_utils::Counter* counter_ptr = &counter; + +#pragma omp parallel for schedule(static, 1) + for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) { + const int32_t task_num_per_expert = + (output_size_2 + w2_n_tile_size - 1) / w2_n_tile_size; + const int32_t task_num = task_num_per_expert * expert_num; + scalar_t* __restrict__ w13_gemm_output_buffer = + reinterpret_cast(common_buffer_start + + w13_gemm_output_buffer_offset); + float* __restrict__ w2_gemm_output_buffer = reinterpret_cast( + common_buffer_start + w2_gemm_output_buffer_offset); + + gemm_t gemm; + + const int32_t w2_n_tile_stride = gemm_n_tile_size * input_size_2; + const int32_t w2_n_group_stride = 16 * input_size_2; + + for (;;) { + int32_t task_id = counter_ptr->acquire_counter(); + if (task_id >= task_num) { + break; + } + + const int32_t curr_expert_id = task_id 
/ task_num_per_expert; + const int32_t curr_output_group_id = task_id % task_num_per_expert; + const int32_t curr_token_num = + token_num_per_group_buffer[curr_expert_id]; + if (curr_token_num == 0) { + continue; + } + + const int32_t actual_n_tile_size = + std::min(w2_n_tile_size, + output_size_2 - curr_output_group_id * w2_n_tile_size); + scalar_t* __restrict__ curr_w13_gemm_output_buffer = + w13_gemm_output_buffer + + cu_token_num_per_group_buffer[curr_expert_id] * input_size_2; + float* __restrict__ curr_w2_gemm_output_buffer = + w2_gemm_output_buffer + + cu_token_num_per_group_buffer[curr_expert_id] * output_size_2 + + curr_output_group_id * w2_n_tile_size; + scalar_t* __restrict__ w2_weight_ptr = + w2 + curr_expert_id * output_size_2 * input_size_2 + + curr_output_group_id * w2_n_tile_size * input_size_2; + scalar_t* __restrict__ w2_bias_ptr = nullptr; + if (w2_bias != nullptr) { + w2_bias_ptr = w2_bias + curr_expert_id * output_size_2 + + curr_output_group_id * w2_n_tile_size; + } + + for (int32_t token_idx = 0; token_idx < curr_token_num; + token_idx += gemm_m_tile_size) { + const int32_t actual_token_num = + std::min(gemm_m_tile_size, curr_token_num - token_idx); + + scalar_t* __restrict__ w2_weight_ptr_iter = w2_weight_ptr; + scalar_t* __restrict__ w2_bias_ptr_iter = w2_bias_ptr; + float* __restrict__ curr_w2_gemm_output_buffer_iter = + curr_w2_gemm_output_buffer; + for (int32_t i = 0; i < actual_n_tile_size; i += gemm_n_tile_size) { + gemm.gemm(curr_w13_gemm_output_buffer, w2_weight_ptr_iter, + curr_w2_gemm_output_buffer_iter, actual_token_num, + input_size_2, input_size_2, w2_n_group_stride, + output_size_2, false); + + if (w2_bias != nullptr) { + cpu_micro_gemm::add_bias_epilogue( + curr_w2_gemm_output_buffer_iter, + curr_w2_gemm_output_buffer_iter, w2_bias_ptr_iter, + actual_token_num, output_size_2, output_size_2); + w2_bias_ptr_iter += gemm_n_tile_size; + } + + w2_weight_ptr_iter += w2_n_tile_stride; + curr_w2_gemm_output_buffer_iter += 
gemm_n_tile_size; + } + + // update + curr_w13_gemm_output_buffer += gemm_m_tile_size * input_size_2; + curr_w2_gemm_output_buffer += gemm_m_tile_size * output_size_2; + } + } + } + } + + // weighted sum + { + alignas(64) cpu_utils::Counter counter; + cpu_utils::Counter* counter_ptr = &counter; + +#pragma omp parallel for schedule(static, 1) + for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) { + const int32_t task_num = token_num; + uint8_t* __restrict__ thread_buffer = + thread_buffer_start + thread_id * ws_thread_buffer_offset; + float* __restrict__ ws_output_buffer = + reinterpret_cast(thread_buffer + ws_output_buffer_offset); + float* __restrict__ w2_gemm_output_buffer = reinterpret_cast( + common_buffer_start + w2_gemm_output_buffer_offset); + + for (;;) { + int32_t task_id = counter_ptr->acquire_counter(); + if (task_id >= task_num) { + break; + } + + int32_t token_id = task_id; + int32_t* __restrict__ curr_expand_token_id_index_buffer = + expand_token_id_index_buffer + token_id * topk_num; + float* __restrict__ curr_weight = topk_weights + token_id * topk_num; + scalar_t* __restrict__ curr_output_buffer = + output + token_id * output_size_2; + + if (topk_num > 1) { + { + int32_t w2_output_idx = curr_expand_token_id_index_buffer[0]; + float* __restrict__ w2_output_iter = + w2_gemm_output_buffer + w2_output_idx * output_size_2; + float* __restrict__ ws_output_buffer_iter = ws_output_buffer; + vec_op::FP32Vec16 weight_vec(curr_weight[0]); + for (int32_t i = 0; i < output_size_2; i += 16) { + vec_op::FP32Vec16 vec(w2_output_iter); + vec = vec * weight_vec; + vec.save(ws_output_buffer_iter); + + // update + w2_output_iter += 16; + ws_output_buffer_iter += 16; + } + } + + { + for (int32_t idx = 1; idx < topk_num - 1; ++idx) { + int32_t w2_output_idx = curr_expand_token_id_index_buffer[idx]; + float* __restrict__ w2_output_iter = + w2_gemm_output_buffer + w2_output_idx * output_size_2; + float* __restrict__ ws_output_buffer_iter = ws_output_buffer; 
+ vec_op::FP32Vec16 weight_vec(curr_weight[idx]); + for (int32_t i = 0; i < output_size_2; i += 16) { + vec_op::FP32Vec16 vec(w2_output_iter); + vec_op::FP32Vec16 sum(ws_output_buffer_iter); + sum = sum + vec * weight_vec; + sum.save(ws_output_buffer_iter); + + // update + w2_output_iter += 16; + ws_output_buffer_iter += 16; + } + } + } + + { + int32_t idx = topk_num - 1; + int32_t w2_output_idx = curr_expand_token_id_index_buffer[idx]; + float* __restrict__ w2_output_iter = + w2_gemm_output_buffer + w2_output_idx * output_size_2; + float* __restrict__ ws_output_buffer_iter = ws_output_buffer; + scalar_t* __restrict__ curr_output_buffer_iter = curr_output_buffer; + vec_op::FP32Vec16 weight_vec(curr_weight[idx]); + for (int32_t i = 0; i < output_size_2; i += 16) { + vec_op::FP32Vec16 vec(w2_output_iter); + vec_op::FP32Vec16 sum(ws_output_buffer_iter); + sum = sum + vec * weight_vec; + scalar_vec_t out_vec(sum); + out_vec.save(curr_output_buffer_iter); + + // update + w2_output_iter += 16; + ws_output_buffer_iter += 16; + curr_output_buffer_iter += 16; + } + } + } else { + int32_t w2_output_idx = curr_expand_token_id_index_buffer[0]; + float* __restrict__ w2_output_iter = + w2_gemm_output_buffer + w2_output_idx * output_size_2; + scalar_t* __restrict__ curr_output_buffer_iter = curr_output_buffer; + vec_op::FP32Vec16 weight_vec(curr_weight[0]); + for (int32_t i = 0; i < output_size_2; i += 16) { + vec_op::FP32Vec16 vec(w2_output_iter); + vec = vec * weight_vec; + scalar_vec_t out_vec(vec); + out_vec.save(curr_output_buffer_iter); + + // update + w2_output_iter += 16; + curr_output_buffer_iter += 16; + } + } + } + } + } +} +} // namespace + +void prepack_moe_weight( + const torch::Tensor& weight, // [expert_num, output_size, input_size] + torch::Tensor& packed_weight, const std::string& isa) { + TORCH_CHECK(weight.is_contiguous()); + const int32_t expert_num = weight.size(0); + const int32_t output_size = weight.size(1); + const int32_t input_size = weight.size(2); + 
TORCH_CHECK_EQ(output_size % 32, 0); + const int64_t expert_stride = weight.stride(0); + cpu_utils::ISA isa_type = cpu_utils::get_isa(isa); + + VLLM_DISPATCH_FLOATING_TYPES( + weight.scalar_type(), "prepack_moe_weight", [&]() { + CPU_ISA_DISPATCH_IMPL(isa_type, [&]() { + scalar_t* weight_ptr = weight.data_ptr(); + scalar_t* packed_weight_ptr = packed_weight.data_ptr(); + prepack_moe_weight_impl( + weight_ptr, packed_weight_ptr, expert_num, output_size, + input_size, expert_stride); + }); + }); +} + +void cpu_fused_moe( + torch::Tensor& output, // [token_num, output_size_2] + const torch::Tensor& input, // [token_num, input_size_13] + const torch::Tensor& + w13, // [expert_num, output_size_13, input_size_13], packed + const torch::Tensor& + w2, // [expert_num, output_size_2, input_size_2], packed + const std::optional& + w13_bias, // [expert_num, output_size_13] + const std::optional& w2_bias, // [expert_num, output_size_2] + const torch::Tensor& topk_weights, // [token_num, k], float32 + const torch::Tensor& topk_id, // [token_num, k], int32 + const std::string& act, const std::string& isa) { + const int32_t token_num = input.size(0); + const int32_t input_size_13 = input.size(1); + const int64_t input_stride = input.stride(0); + TORCH_CHECK_EQ(input_stride, input_size_13); + const int32_t expert_num = w13.size(0); + const int32_t output_size_13 = w13.size(1); + const int32_t input_size_2 = w2.size(2); + const int32_t output_size_2 = w2.size(1); + const int32_t topk_num = topk_id.size(1); + const FusedMOEAct act_type = get_act_type(act); + cpu_utils::ISA isa_type = cpu_utils::get_isa(isa); + + VLLM_DISPATCH_FLOATING_TYPES(w13.scalar_type(), "cpu_fused_moe", [&]() { + CPU_ISA_DISPATCH_IMPL(isa_type, [&]() { + fused_moe_impl( + output.data_ptr(), input.data_ptr(), + w13.data_ptr(), w2.data_ptr(), + w13_bias.has_value() ? w13_bias->data_ptr() : nullptr, + w2_bias.has_value() ? 
w2_bias->data_ptr() : nullptr, + topk_weights.data_ptr(), topk_id.data_ptr(), act_type, + token_num, expert_num, topk_num, input_size_13, output_size_13, + input_size_2, output_size_2); + }); + }); +} diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 6f51277f78440..d94af338ac1c9 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -352,6 +352,10 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(bool, void* ptr) : reg((__m512)_mm512_stream_load_si512(ptr)) {} + // strided load + explicit FP32Vec16(const float* ptr, INT32Vec16 idx) + : reg(_mm512_i32gather_ps(idx.reg, ptr, 4)) {} + explicit FP32Vec16(__m512 data) : reg(data) {} // de-pack 4 bit values @@ -408,6 +412,10 @@ struct FP32Vec16 : public Vec { return FP32Vec16(_mm512_sub_ps(reg, b.reg)); } + FP32Vec16 operator-() const { + return FP32Vec16(_mm512_xor_ps(reg, _mm512_set1_ps(-0.0f))); + } + FP32Vec16 operator/(const FP32Vec16& b) const { return FP32Vec16(_mm512_div_ps(reg, b.reg)); } diff --git a/csrc/cpu/cpu_wna16.cpp b/csrc/cpu/cpu_wna16.cpp index 816d195506e52..88d48f3db8772 100644 --- a/csrc/cpu/cpu_wna16.cpp +++ b/csrc/cpu/cpu_wna16.cpp @@ -1,6 +1,5 @@ -#include "cpu_types.hpp" -#include "scratchpad_manager.h" -#include "utils.hpp" +#include "cpu/cpu_types.hpp" +#include "cpu/utils.hpp" #ifdef CPU_CAPABILITY_AMXBF16 #include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp" @@ -158,7 +157,7 @@ void cpu_gemm_wna16_impl( // a simple schedule policy, just to hold more B tiles in L2 and make sure // each thread has tasks const int32_t n_partition_size = [&]() { - const int64_t cache_size = cpu_utils::get_l2_size(); + const int64_t cache_size = cpu_utils::get_available_l2_size(); int64_t ps_cache_limit = cache_size / (k_size * sizeof(scalar_t)); int64_t ps_thread_limit = n_size / thread_num; ps_cache_limit = @@ -179,8 +178,8 @@ void cpu_gemm_wna16_impl( const int64_t b_buffer_offset = 0; const int64_t c_buffer_offset = b_buffer_size; const int64_t buffer_size = 
b_buffer_size + c_buffer_size; - DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(buffer_size * - thread_num); + cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(buffer_size * + thread_num); alignas(64) cpu_utils::Counter counter; cpu_utils::Counter* counter_ptr = &counter; @@ -190,9 +189,10 @@ void cpu_gemm_wna16_impl( scalar_t* __restrict__ b_buffer = nullptr; float* __restrict__ c_buffer = nullptr; { - uint8_t* buffer_ptr = DNNLScratchPadManager::get_dnnl_scratchpad_manager() - ->get_data() + - thread_id * buffer_size; + uint8_t* buffer_ptr = + cpu_utils::ScratchPadManager::get_scratchpad_manager() + ->get_data() + + thread_id * buffer_size; b_buffer = reinterpret_cast(buffer_ptr + b_buffer_offset); c_buffer = reinterpret_cast(buffer_ptr + c_buffer_offset); } diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp index cfb6e78cba9a1..e337e10e1cf7b 100644 --- a/csrc/cpu/dnnl_helper.cpp +++ b/csrc/cpu/dnnl_helper.cpp @@ -4,8 +4,8 @@ #include "common/memory_desc.hpp" #include "common/memory.hpp" -#include "dnnl_helper.h" -#include "scratchpad_manager.h" +#include "cpu/utils.hpp" +#include "cpu/dnnl_helper.h" static dnnl::engine& default_engine() { static dnnl::engine engine(dnnl::engine::kind::cpu, 0); @@ -274,7 +274,7 @@ void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) { auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5); scratchpad_storage->set_data_handle( - DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data()); + cpu_utils::ScratchPadManager::get_scratchpad_manager()->get_data()); matmul.execute(default_stream(), memory_cache_); default_stream().wait(); @@ -294,7 +294,7 @@ dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache( return m_size_cache_->get_or_create(key, [&]() { dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false); - auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager(); + auto manager = 
cpu_utils::ScratchPadManager::get_scratchpad_manager(); manager->realloc(desc.scratchpad_desc().get_size()); return dnnl::matmul(desc); }); @@ -470,7 +470,7 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) { auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3); scratchpad_storage->set_data_handle( - DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data()); + cpu_utils::ScratchPadManager::get_scratchpad_manager()->get_data()); matmul.execute(default_stream(), memory_cache_); default_stream().wait(); @@ -486,7 +486,7 @@ dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache( } return m_size_cache_->get_or_create(key, [&]() { dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false); - auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager(); + auto manager = cpu_utils::ScratchPadManager::get_scratchpad_manager(); manager->realloc(desc.scratchpad_desc().get_size()); return dnnl::matmul(desc); }); diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp index 87a019773a895..357c7cf1d7844 100644 --- a/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp +++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp @@ -235,6 +235,39 @@ class MicroGemm { } } + static void pack_weight(const scalar_t* __restrict__ weight, + scalar_t* __restrict__ packed_weight, + const int32_t output_size, const int32_t input_size) { + constexpr int32_t elem_num_per_group = 4 / sizeof(scalar_t); + TORCH_CHECK_EQ(output_size % 16, 0); + TORCH_CHECK_EQ(input_size % (16 * elem_num_per_group), 0); + + const int32_t output_group_num = output_size / 16; + const int32_t input_32b_num = input_size / elem_num_per_group; + for (int32_t output_group_idx = 0; output_group_idx < output_group_num; + ++output_group_idx) { + const int32_t* __restrict__ weight_32b = + reinterpret_cast(weight); + int32_t* __restrict__ packed_weight_32b = + reinterpret_cast(packed_weight); + for (int32_t output_idx = 0; output_idx < 16; 
++output_idx) { + for (int32_t weight_offset = 0, packed_offset = 0; + weight_offset < input_32b_num; + ++weight_offset, packed_offset += 16) { + packed_weight_32b[packed_offset] = weight_32b[weight_offset]; + } + + // update + weight_32b += input_32b_num; + packed_weight_32b += 1; + } + + // update + weight += 16 * input_size; + packed_weight += 16 * input_size; + } + } + private: alignas(64) __tilecfg amx_tile_config_; int32_t curr_m_; diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp index 784da55a420e5..23e78a681b5fe 100644 --- a/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp +++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp @@ -13,6 +13,9 @@ namespace cpu_micro_gemm { #define CPU_MICRO_GEMM_PARAMS \ a_ptr, b_ptr, c_ptr, m, k, lda, b_n_group_stride, ldc, accum_c +// Note: weights for MicroGemm should be packed as (output_size / 16) contiguous +// blocks, means the logical shape of blocks is [16, input_size]. And the actual +// layout of blocks can be ISA-specific. 
template class MicroGemm { public: @@ -86,6 +89,41 @@ FORCE_INLINE void bias_epilogue(float* __restrict__ c_ptr, curr_d += ldd; } } + +template +FORCE_INLINE void add_bias_epilogue(float* c_ptr, float* d_ptr, + scalar_t* __restrict__ bias_ptr, + const int32_t m, const int64_t ldc, + const int64_t ldd) { + using scalar_vec_t = typename cpu_utils::VecTypeTrait::vec_t; + static_assert(n_size % 16 == 0); + constexpr int32_t n_group_num = n_size / 16; + static_assert(n_group_num <= 16); + + vec_op::FP32Vec16 bias_vecs[n_group_num]; + scalar_t* __restrict__ curr_bias = bias_ptr; + vec_op::unroll_loop([&](int32_t i) { + scalar_vec_t vec(curr_bias); + bias_vecs[i] = vec_op::FP32Vec16(vec); + curr_bias += 16; + }); + + float* curr_c = c_ptr; + float* curr_d = d_ptr; + for (int32_t i = 0; i < m; ++i) { + float* curr_c_iter = curr_c; + float* curr_d_iter = curr_d; + vec_op::unroll_loop([&](int32_t n_g_idx) { + vec_op::FP32Vec16 c_vec_fp32(curr_c_iter); + c_vec_fp32 = c_vec_fp32 + bias_vecs[n_g_idx]; + c_vec_fp32.save(curr_d_iter); + curr_c_iter += 16; + curr_d_iter += 16; + }); + curr_c += ldc; + curr_d += ldd; + } +} } // namespace cpu_micro_gemm #endif diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp index 3985c2f2e5fe4..bdd3e85a1c522 100644 --- a/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp +++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp @@ -109,6 +109,25 @@ class MicroGemm { void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { TileGemm82::gemm(CPU_MICRO_GEMM_PARAMS); } + + // Note: pack contiguous weight [output_size, input_size] as contiguous + // packed weight [output_size / 16, input_size, 16] + static void pack_weight(const scalar_t* __restrict__ weight, + scalar_t* __restrict__ packed_weight, + const int32_t output_size, const int32_t input_size) { + TORCH_CHECK_EQ(output_size % 16, 0); + for (int32_t o_idx = 0; o_idx < output_size; ++o_idx) { + const scalar_t* __restrict__ curr_weight = weight + o_idx * input_size; + scalar_t* 
__restrict__ curr_packed_weight = + packed_weight + (o_idx / 16) * (16 * input_size) + o_idx % 16; + for (int32_t i_idx = 0; i_idx < input_size; ++i_idx) { + *curr_packed_weight = *curr_weight; + + curr_packed_weight += 16; + ++curr_weight; + } + } + } }; } // namespace cpu_micro_gemm diff --git a/csrc/cpu/scratchpad_manager.cpp b/csrc/cpu/scratchpad_manager.cpp deleted file mode 100644 index 05cd435f34b7a..0000000000000 --- a/csrc/cpu/scratchpad_manager.cpp +++ /dev/null @@ -1,23 +0,0 @@ -#include - -#include "scratchpad_manager.h" - -DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) { - this->realloc(allocation_unit * 128); -} - -void DNNLScratchPadManager::realloc(size_t new_size) { - new_size = round(new_size); - if (new_size > size_) { - if (ptr_ != nullptr) { - std::free(ptr_); - } - ptr_ = std::aligned_alloc(64, new_size); - size_ = new_size; - } -} - -DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() { - static DNNLScratchPadManager manager; - return &manager; -} diff --git a/csrc/cpu/scratchpad_manager.h b/csrc/cpu/scratchpad_manager.h deleted file mode 100644 index 0ecf59192f845..0000000000000 --- a/csrc/cpu/scratchpad_manager.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef SCRATCHPAD_MANAGER_H -#define SCRATCHPAD_MANAGER_H - -#include -#include - -class DNNLScratchPadManager { - public: - static constexpr size_t allocation_unit = 4 * 1024; // 4KB - - static DNNLScratchPadManager* get_dnnl_scratchpad_manager(); - - DNNLScratchPadManager(); - - template - T* get_data() { - return reinterpret_cast(ptr_); - } - - static size_t round(size_t size) { - return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit; - } - - void realloc(size_t new_size); - - private: - size_t size_; - void* ptr_; -}; - -#endif diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index e0e3ef71b485f..dd419405c94b9 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -110,6 +110,17 @@ 
void cpu_gemm_wna16(const torch::Tensor& input, const torch::Tensor& q_weight, const std::optional& bias, const int64_t pack_factor, const std::string& isa_hint); +void prepack_moe_weight(const torch::Tensor& weight, + torch::Tensor& packed_weight, const std::string& isa); + +void cpu_fused_moe(torch::Tensor& output, const torch::Tensor& input, + const torch::Tensor& w13, const torch::Tensor& w2, + const std::optional& w13_bias, + const std::optional& w2_bias, + const torch::Tensor& topk_weights, + const torch::Tensor& topk_id, const std::string& act, + const std::string& isa); + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -296,6 +307,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "pack_factor, str isa_hint) -> ()"); ops.impl("cpu_gemm_wna16", torch::kCPU, &cpu_gemm_wna16); #endif + + // fused moe +#if defined(__AVX512F__) + ops.def( + "prepack_moe_weight(Tensor weight, Tensor(a1!) packed_weight, str isa) " + "-> ()"); + ops.impl("prepack_moe_weight", torch::kCPU, &prepack_moe_weight); + ops.def( + "cpu_fused_moe(Tensor(a0!) output, Tensor input, Tensor w13, Tensor w2, " + "Tensor? w13_bias, Tensor? 
w2_bias, Tensor topk_weights, Tensor topk_id, " + "str act, str isa) -> ()"); + ops.impl("cpu_fused_moe", torch::kCPU, &cpu_fused_moe); +#endif } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp index 3dacfc7b2b7a3..fcd7534ab4c5d 100644 --- a/csrc/cpu/utils.cpp +++ b/csrc/cpu/utils.cpp @@ -10,7 +10,7 @@ #define gettid() syscall(SYS_gettid) #endif -#include "cpu_types.hpp" +#include "cpu/utils.hpp" #ifdef VLLM_NUMA_DISABLED std::string init_cpu_threads_env(const std::string& cpu_ids) { @@ -138,4 +138,26 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { return ss.str(); } + +namespace cpu_utils { +ScratchPadManager::ScratchPadManager() : size_(0), ptr_(nullptr) { + this->realloc(allocation_unit * 128); +} + +void ScratchPadManager::realloc(size_t new_size) { + new_size = round(new_size); + if (new_size > size_) { + if (ptr_ != nullptr) { + std::free(ptr_); + } + ptr_ = std::aligned_alloc(64, new_size); + size_ = new_size; + } +} + +ScratchPadManager* ScratchPadManager::get_scratchpad_manager() { + static ScratchPadManager manager; + return &manager; +} +} // namespace cpu_utils #endif diff --git a/csrc/cpu/utils.hpp b/csrc/cpu/utils.hpp index d3def306b8069..8ab0bb039c014 100644 --- a/csrc/cpu/utils.hpp +++ b/csrc/cpu/utils.hpp @@ -2,19 +2,24 @@ #define UTILS_HPP #include -#include -#include #include +#include -#if defined(__APPLE__) - #include -#endif - -#include "cpu_types.hpp" +#include "cpu/cpu_types.hpp" namespace cpu_utils { enum class ISA { AMX, VEC }; +inline ISA get_isa(const std::string& isa) { + if (isa == "amx") { + return ISA::AMX; + } else if (isa == "vec") { + return ISA::VEC; + } else { + TORCH_CHECK(false, "Invalid isa type: " + isa); + } +} + template struct VecTypeTrait { using vec_t = void; @@ -48,26 +53,66 @@ struct Counter { int64_t acquire_counter() { return counter++; } }; -inline int64_t get_l2_size() { +inline int64_t get_available_l2_size() { static 
int64_t size = []() { -#if defined(__APPLE__) - // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname. - int64_t l2_cache_size = 0; - size_t len = sizeof(l2_cache_size); - if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 && - l2_cache_size > 0) { - return l2_cache_size >> 1; // use 50% of L2 cache - } - // Fallback if sysctlbyname fails - return 128LL * 1024 >> 1; // use 50% of 128KB -#else - long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); - assert(l2_cache_size != -1); + const uint32_t l2_cache_size = at::cpu::L2_cache_size(); return l2_cache_size >> 1; // use 50% of L2 cache -#endif }(); return size; } + +template +inline T round_up(T size) { + T alignment = alignment_v; + return (((size + alignment - 1) / alignment) * alignment); +} + +template +inline T round_down(T size) { + T alignment = alignment_v; + return (size / alignment) * alignment; +} + +template +inline void print_logits(const char* name, T* ptr, int32_t row, int32_t col, + int32_t stride) { + std::stringstream ss; + ss << std::fixed << std::setprecision(5) << name << ": [\n"; + auto* curr_logits_buffer = ptr; + for (int32_t m = 0; m < row; ++m) { + for (int32_t n = 0; n < col; ++n) { + ss << curr_logits_buffer[n] << ", "; + } + ss << "\n"; + curr_logits_buffer += stride; + } + ss << "]\n"; + std::printf("%s", ss.str().c_str()); +} + +class ScratchPadManager { + public: + static constexpr size_t allocation_unit = 4 * 1024; // 4KB + + static ScratchPadManager* get_scratchpad_manager(); + + ScratchPadManager(); + + template + T* get_data() { + return reinterpret_cast(ptr_); + } + + static size_t round(size_t size) { + return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit; + } + + void realloc(size_t new_size); + + private: + size_t size_; + void* ptr_; +}; } // namespace cpu_utils #endif diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index bd5bc43916eac..2caf1ad144178 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ 
-147,7 +147,9 @@ WORKDIR /workspace/vllm RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \ - apt-get install -y --no-install-recommends vim numactl xz-utils + apt-get install -y --no-install-recommends vim numactl xz-utils make clangd-14 + +RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ diff --git a/requirements/cpu-build.txt b/requirements/cpu-build.txt index 1ea401a04a12c..a7bd3b17b6323 100644 --- a/requirements/cpu-build.txt +++ b/requirements/cpu-build.txt @@ -1,7 +1,7 @@ cmake>=3.26.1 ninja packaging>=24.2 -setuptools>=77.0.3,<81.0.0 +setuptools==77.0.3 # this version can reuse CMake build dir setuptools-scm>=8 torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x" torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64" diff --git a/requirements/cpu.txt b/requirements/cpu.txt index 7a670812e8943..111b8a5511562 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -1,6 +1,8 @@ # Common dependencies -r common.txt +setuptools==77.0.3 # this version can reuse CMake build dir + numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding # Dependencies for CPUs diff --git a/tests/kernels/moe/test_cpu_fused_moe.py b/tests/kernels/moe/test_cpu_fused_moe.py new file mode 100644 index 0000000000000..4dda45a6c7409 --- /dev/null +++ b/tests/kernels/moe/test_cpu_fused_moe.py @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch + +from tests.kernels.allclose_default import get_default_atol, get_default_rtol +from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight +from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul +from vllm.platforms import current_platform + 
+if not current_platform.is_cpu(): + pytest.skip("skipping CPU-only tests", allow_module_level=True) + +EXPERT_NUM = [ + 8, +] +HIDDEN_DIM = [128, 2880] +INTERMEDIATE_DIM = [128, 2880] +BATCH_SIZE = [1, 64, 256] +ACT = ["silu", "swigluoai"] +USE_BIAS = [True, False] +ISA = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"] +DTYPE = [torch.bfloat16] + +_CPU_MOE_ACT = { + "silu": SiluAndMul(), + "swigluoai": SwigluOAIAndMul(), +} + + +def ref_fused_moe( + input: torch.Tensor, + w13: torch.Tensor, + w2: torch.Tensor, + w13_bias: torch.Tensor | None, + w2_bias: torch.Tensor | None, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, +) -> torch.Tensor: + len_experts = w13.size(0) + + cnts = topk_ids.new_zeros((topk_ids.shape[0], len_experts)) + cnts.scatter_(1, topk_ids.to(torch.int64), 1) + tokens_per_expert = cnts.sum(dim=0) + idxs = topk_ids.view(-1).argsort() + + sorted_tokens = input[idxs // topk_ids.shape[1]] + tokens_per_expert = tokens_per_expert.cpu().numpy() + + outputs = [] + start_idx = 0 + + for i, num_tokens in enumerate(tokens_per_expert): + end_idx = start_idx + num_tokens + if num_tokens == 0: + continue + tokens_for_this_expert = sorted_tokens[start_idx:end_idx].float() + curr_w13 = w13[i].float() + curr_w2 = w2[i].float() + + curr_w13_bias = None + if w13_bias is not None: + curr_w13_bias = w13_bias[i].float() + + curr_w2_bias = None + if w2_bias is not None: + curr_w2_bias = w2_bias[i].float() + + gate_up = torch.nn.functional.linear( + tokens_for_this_expert, curr_w13, curr_w13_bias + ) + # Note: to simulate the kernel implementation + gate_up = ( + _CPU_MOE_ACT[activation] + .forward_native(gate_up) + .to(dtype=input.dtype) + .float() + ) + expert_out = torch.nn.functional.linear(gate_up, curr_w2, curr_w2_bias) + + outputs.append(expert_out) + start_idx = end_idx + + outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0) + new_x = torch.empty_like(outs) + + new_x[idxs] = outs + 
final_out = ( + new_x.view(*topk_ids.shape, -1) + .mul_(topk_weights.unsqueeze(dim=-1)) + .sum(dim=1) + .type(input.dtype) + ) + return final_out + + +@pytest.mark.parametrize("batch_size", BATCH_SIZE) +@pytest.mark.parametrize("expert_num", EXPERT_NUM) +@pytest.mark.parametrize("hidden_size", HIDDEN_DIM) +@pytest.mark.parametrize("intermediate_size", INTERMEDIATE_DIM) +@pytest.mark.parametrize("use_bias", USE_BIAS) +@pytest.mark.parametrize("dtype", DTYPE) +@pytest.mark.parametrize("act", ACT) +@pytest.mark.parametrize("isa", ISA) +def test_cpu_fused_moe( + batch_size: int, + expert_num: int, + hidden_size: int, + intermediate_size: int, + use_bias: bool, + dtype: torch.dtype, + act: str, + isa: str, +): + current_platform.seed_everything(0) + + topk_num = max(expert_num // 2, 1) + up_dim = 2 * intermediate_size + + input = torch.randn((batch_size, hidden_size), dtype=dtype) / ( + 0.5 * hidden_size**0.5 + ) + w13 = torch.randn((expert_num, up_dim, hidden_size), dtype=dtype) / ( + 0.5 * hidden_size**0.5 + ) + w2 = torch.randn((expert_num, hidden_size, intermediate_size), dtype=dtype) / ( + 0.5 * intermediate_size**0.5 + ) + router_logits = torch.randn((batch_size, expert_num), dtype=dtype) + w13_bias = None + w2_bias = None + if use_bias: + w13_bias = torch.randn((expert_num, up_dim), dtype=dtype) / (0.5 * up_dim**0.5) + w2_bias = torch.randn((expert_num, hidden_size), dtype=dtype) / ( + 0.5 * hidden_size**0.5 + ) + score = torch.softmax(router_logits, dim=-1, dtype=torch.float32) + topk_weight, topk_ids = torch.topk(score, topk_num) + topk_ids = topk_ids.to(torch.int32) + + ref_output = ref_fused_moe( + input, + w13, + w2, + w13_bias, + w2_bias, + topk_weight, + topk_ids, + act, + ) + + packed_w13 = cpu_prepack_moe_weight(w13, isa) + packed_w2 = cpu_prepack_moe_weight(w2, isa) + output = cpu_fused_moe( + input, + packed_w13, + packed_w2, + w13_bias, + w2_bias, + topk_weight, + topk_ids, + act, + isa, + ) + + atol, rtol = get_default_atol(output), 
get_default_rtol(output) + ( + torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), + f"{torch.max(torch.abs(output - ref_output))}", + ) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 2319655008c50..cf7f17a033be3 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -2919,6 +2919,42 @@ def cpu_gemm_wna16( return output +def cpu_prepack_moe_weight( + weight: torch.Tensor, + isa: str, +) -> torch.Tensor: + output = torch.empty_like(weight) + torch.ops._C.prepack_moe_weight(weight, output, isa) + return output + + +def cpu_fused_moe( + input: torch.Tensor, + w13: torch.Tensor, + w2: torch.Tensor, + w13_bias: torch.Tensor | None, + w2_bias: torch.Tensor | None, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + act: str, + isa: str, +) -> torch.Tensor: + output = torch.empty_like(input) + torch.ops._C.cpu_fused_moe( + output, + input, + w13, + w2, + w13_bias, + w2_bias, + topk_weights, + topk_ids, + act, + isa, + ) + return output + + if hasattr(torch.ops._qutlass_C, "matmul_mxf4_bf16_tn"): @register_fake("_qutlass_C::matmul_mxf4_bf16_tn") diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py index 659a2d4ee5b39..cf7a4313de24c 100644 --- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py @@ -1,12 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import weakref from collections.abc import Callable import torch from torch.nn import functional as F from vllm import _custom_ops as ops +from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul +from vllm.model_executor.layers.quantization.utils.layer_utils import replace_parameter +from vllm.utils.torch_utils import direct_register_custom_op + +_CPU_MOE_LAYER_CACHE = {} +_CPU_MOE_ACT = { + 
"silu": SiluAndMul(), + "swigluoai": SwigluOAIAndMul(), +} def grouped_topk( @@ -174,8 +184,105 @@ class SGLFusedMOE: class CPUFusedMOE: def __init__(self, layer: torch.nn.Module) -> None: - use_onednn_mm = ops._supports_onednn and ops.is_onednn_acl_supported() + use_grouped_gemm, isa = self.check_grouped_gemm(layer) + self.isa = isa + if use_grouped_gemm: + self.forward_method = self.forward_grouped_gemm + self.init_moe_grouped_gemm(layer=layer) + else: + self.forward_method = self.forward_torch + self.init_moe_torch(layer=layer) + def __call__( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + ) -> torch.Tensor: + assert activation in _CPU_MOE_ACT, f"{activation} is not supported." 
+ assert not apply_router_weight_on_input + + topk_weights, topk_ids = select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + ) + + return self.forward_method( + layer, + x, + topk_weights, + topk_ids, + activation, + global_num_experts, + ) + + def check_grouped_gemm( + self, + layer: torch.nn.Module, + ) -> tuple[bool, str]: + if not hasattr(torch.ops._C, "prepack_moe_weight"): + return False, "none" + + dtype = layer.w13_weight.dtype + w13_input_size = layer.w13_weight.size(2) + w13_output_size = layer.w13_weight.size(1) + w2_input_size = layer.w2_weight.size(2) + w2_output_size = layer.w2_weight.size(1) + + if not (w13_output_size % 32 == 0 and w2_output_size % 32 == 0): + return False, "none" + + supports_amx = torch._C._cpu._is_amx_tile_supported() + + if ( + supports_amx + and dtype == torch.bfloat16 + and w13_input_size % 32 == 0 + and w2_input_size % 32 == 0 + ): + return True, "amx" + + if supports_amx: + return False, "none" + + return True, "vec" + + def init_moe_grouped_gemm( + self, + layer: torch.nn.Module, + ) -> None: + new_w13 = cpu_prepack_moe_weight(layer.w13_weight, self.isa) + replace_parameter(layer, "w13_weight", new_w13) + new_w2 = cpu_prepack_moe_weight(layer.w2_weight, self.isa) + replace_parameter(layer, "w2_weight", new_w2) + + def init_moe_torch( + self, + layer: torch.nn.Module, + ) -> None: + use_onednn_mm = ops._supports_onednn and ops.is_onednn_acl_supported() num_experts = layer.w13_weight.size(0) has_w13_bias = hasattr(layer, "w13_bias") has_w2_bias = hasattr(layer, "w2_bias") @@ -208,85 +315,112 @@ class CPUFusedMOE: layer.down_linear.append( lambda x, w=layer_w2_weight, b=layer_w2_bias: F.linear(x, w, 
b) ) + if use_onednn_mm: # remove weight layer.w13_weight = torch.nn.Parameter(torch.empty(0), requires_grad=False) layer.w2_weight = torch.nn.Parameter(torch.empty(0), requires_grad=False) - self.act_to_impl = { - "silu": SiluAndMul(), - "swigluoai": SwigluOAIAndMul(), - } + _CPU_MOE_LAYER_CACHE[id(layer)] = weakref.ref(layer) - def __call__( + def forward_grouped_gemm( self, layer: torch.nn.Module, - x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, - router_logits: torch.Tensor, - renormalize: bool, - topk_group: int | None = None, - num_expert_group: int | None = None, + input: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", ) -> torch.Tensor: - assert activation in self.act_to_impl, f"{activation} is not supported." 
- assert not apply_router_weight_on_input - topk_weights, topk_ids = select_experts( - hidden_states=x, - router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, + output = cpu_fused_moe( + input, + layer.w13_weight, + layer.w2_weight, + getattr(layer, "w13_bias", None), + getattr(layer, "w2_bias", None), + topk_weights, + topk_ids, + activation, + self.isa, + ) + return output + + def forward_torch( + self, + layer: torch.nn.Module, + input: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int = -1, + ) -> torch.Tensor: + output = torch.empty_like(input) + layer_id = id(layer) + torch.ops.vllm.cpu_fused_moe_torch( + layer_id, + output, + input, + topk_weights, + topk_ids, + activation, + global_num_experts, ) - # Ref code from https://github.com/sgl-project/sglang/blob/716e682721397df103f347d22da8bd46c6016dab/python/sglang/srt/layers/moe/fused_moe_native.py#L53 - len_experts = global_num_experts + return output - cnts = topk_ids.new_zeros((topk_ids.shape[0], len_experts)) - cnts.scatter_(1, topk_ids.to(torch.int64), 1) - tokens_per_expert = cnts.sum(dim=0) - idxs = topk_ids.view(-1).argsort() - sorted_tokens = x[idxs // topk_ids.shape[1]] - tokens_per_expert = tokens_per_expert.cpu().numpy() +def cpu_fused_moe_torch( + layer_id: int, + output: torch.Tensor, + input: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int = -1, +) -> None: + layer = _CPU_MOE_LAYER_CACHE[layer_id]() - outputs = [] - start_idx = 0 + # Ref code from 
https://github.com/sgl-project/sglang/blob/716e682721397df103f347d22da8bd46c6016dab/python/sglang/srt/layers/moe/fused_moe_native.py#L53 + len_experts = global_num_experts - for i, num_tokens in enumerate(tokens_per_expert): - end_idx = start_idx + num_tokens - if num_tokens == 0: - continue - tokens_for_this_expert = sorted_tokens[start_idx:end_idx] + cnts = topk_ids.new_zeros((topk_ids.shape[0], len_experts)) + cnts.scatter_(1, topk_ids.to(torch.int64), 1) + tokens_per_expert = cnts.sum(dim=0) + idxs = topk_ids.view(-1).argsort() - gate_up = layer.gate_up_linear[i](tokens_for_this_expert) - gate_up = self.act_to_impl[activation].forward_native(gate_up) - expert_out = layer.down_linear[i](gate_up) - outputs.append(expert_out) - start_idx = end_idx + sorted_tokens = input[idxs // topk_ids.shape[1]] + tokens_per_expert = tokens_per_expert.cpu().numpy() - outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0) - new_x = torch.empty_like(outs) + outputs = [] + start_idx = 0 - new_x[idxs] = outs - final_out = ( - new_x.view(*topk_ids.shape, -1) - .type(topk_weights.dtype) - .mul_(topk_weights.unsqueeze(dim=-1)) - .sum(dim=1) - .type(new_x.dtype) - ) - return final_out + for i, num_tokens in enumerate(tokens_per_expert): + end_idx = start_idx + num_tokens + if num_tokens == 0: + continue + tokens_for_this_expert = sorted_tokens[start_idx:end_idx] + + gate_up = layer.gate_up_linear[i](tokens_for_this_expert) # type: ignore + gate_up = _CPU_MOE_ACT[activation].forward_native(gate_up) + expert_out = layer.down_linear[i](gate_up) # type: ignore + outputs.append(expert_out) + start_idx = end_idx + + outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0) + new_x = torch.empty_like(outs) + + new_x[idxs] = outs + final_out = ( + new_x.view(*topk_ids.shape, -1) + .type(topk_weights.dtype) + .mul_(topk_weights.unsqueeze(dim=-1)) + .sum(dim=1) + .type(new_x.dtype) + ) + output.copy_(final_out) + + +direct_register_custom_op( + 
op_name="cpu_fused_moe_torch", + op_func=cpu_fused_moe_torch, + mutates_args=["output"], +) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index db97d6eb88ea5..6a65b06014bca 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1726,9 +1726,10 @@ class FusedMoE(CustomOp): return states if self.shared_experts is None: - if current_platform.is_tpu(): + if current_platform.is_tpu() or current_platform.is_cpu(): # TODO: Once the OOM issue for the TPU backend is resolved, we # will switch to using the moe_forward custom op. + # Note: CPU doesn't require wrapped forward_impl. fused_output = self.forward_impl(hidden_states, router_logits) assert not isinstance(fused_output, tuple) else: @@ -1744,9 +1745,10 @@ class FusedMoE(CustomOp): else: return reduce_output(fused_output)[..., :og_hidden_states] else: - if current_platform.is_tpu(): + if current_platform.is_tpu() or current_platform.is_cpu(): # TODO: Once the OOM issue for the TPU backend is resolved, we # will switch to using the moe_forward custom op. + # Note: CPU doesn't require wrapped forward_impl. 
shared_output, fused_output = self.forward_impl( hidden_states, router_logits ) From 11a89cf95caaec8dec13fab1e8e3d64c9a852a08 Mon Sep 17 00:00:00 2001 From: Yifan Qiao Date: Wed, 17 Dec 2025 22:42:21 -0800 Subject: [PATCH 052/176] [Fix][FlexAttention] return max logical block index to handle reused blocks (#30915) Signed-off-by: Yifan Qiao --- tests/kernels/test_flex_attention.py | 31 +++++++++++++++++++- vllm/v1/attention/backends/flex_attention.py | 15 ++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/tests/kernels/test_flex_attention.py b/tests/kernels/test_flex_attention.py index f6987d54399d2..7053a8697e190 100644 --- a/tests/kernels/test_flex_attention.py +++ b/tests/kernels/test_flex_attention.py @@ -15,7 +15,10 @@ from tests.v1.attention.utils import ( create_standard_kv_cache_spec, create_vllm_config, ) -from vllm.v1.attention.backends.flex_attention import FlexAttentionMetadataBuilder +from vllm.v1.attention.backends.flex_attention import ( + FlexAttentionMetadataBuilder, + physical_to_logical_mapping, +) from ..models.utils import check_embeddings_close, check_logprobs_close @@ -205,5 +208,31 @@ def test_block_mask_direct_vs_slow_path(): ) +def test_physical_to_logical_mapping_handles_reused_blocks(): + """Regression test: reused physical blocks map to the latest logical block. + + For sliding-window / hybrid attention layers, physical KV-cache blocks can be + reused over time. The inverse mapping must therefore select the latest + logical block index for a physical block id. + """ + # Padding should not make physical block 0 look live. 
+ block_table = torch.tensor([[6, 0, 0, 0]], dtype=torch.int32) + seq_lens = torch.tensor([1 * 16], dtype=torch.int32) # only 1 block valid + out = physical_to_logical_mapping( + block_table=block_table, seq_lens=seq_lens, block_size=16, total_blocks=10 + ) + assert out[0, 0].item() == -1 + assert out[0, 6].item() == 0 + + # If a physical block id appears multiple times (block reuse), mapping should + # point to the latest logical block index. + block_table2 = torch.tensor([[2, 2, 5]], dtype=torch.int32) + seq_lens2 = torch.tensor([3 * 16], dtype=torch.int32) + out2 = physical_to_logical_mapping( + block_table=block_table2, seq_lens=seq_lens2, block_size=16, total_blocks=8 + ) + assert out2[0, 2].item() == 1 + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index d8dbe4cbae013..8193c05c2b1ab 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -160,7 +160,7 @@ def physical_to_logical_mapping( └───────────────────────────────────────────┘ If multiple logical blocks map to the same physical block, - this function returns the first (minimum) logical block index. + this function returns the latest (maximum) logical block index. If a physical block is not mapped to by any logical block, its value in the result will be -1. @@ -183,6 +183,15 @@ def physical_to_logical_mapping( To prevent this, we use seq_lens and block_size to mask out unused entries, ensuring only valid block references are processed. + IMPORTANT: Reused physical blocks (sliding-window / hybrid attention) + ──────────────────────────────────────────────────────────────────── + For some attention types, physical cache blocks can be reused over time. + This can cause the same physical block id to appear multiple times in a row + of `block_table` at different logical block indices. 
In that case, only the + latest logical block index corresponds to the current contents of that + physical block. Therefore, the inverse mapping must pick the maximum logical + block index for each physical block id. + Args: block_table: Tensor of shape [max_reqs, max_num_blocks] mapping logical blocks to physical locations. May contain @@ -217,8 +226,8 @@ def physical_to_logical_mapping( mask, torch.arange(max_num_blocks, device=device)[None, :], 0 ) - physical_to_logical.scatter_( - -1, valid_block_table.to(torch.int64), valid_logical_indices + physical_to_logical.scatter_reduce_( + -1, valid_block_table.to(torch.int64), valid_logical_indices, reduce="amax" ) # NB - Seems like block 0 is always empty so we reset it manually physical_to_logical[:, 0] = -1 From a85724bd6e51af58793da365da7b70b1b72d71b9 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Thu, 18 Dec 2025 14:45:29 +0800 Subject: [PATCH 053/176] [Platform] Let EPD work with non-cuda platform (#30225) Signed-off-by: wangxiyuan --- .../ec_transfer/ec_connector/example_connector.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/ec_transfer/ec_connector/example_connector.py b/vllm/distributed/ec_transfer/ec_connector/example_connector.py index c9aad9e9fc8f3..3518044ce2e00 100644 --- a/vllm/distributed/ec_transfer/ec_connector/example_connector.py +++ b/vllm/distributed/ec_transfer/ec_connector/example_connector.py @@ -73,6 +73,7 @@ class ECExampleConnector(ECConnectorBase): data hashes (`mm_hash`) to encoder cache tensors. kwargs (dict): Additional keyword arguments for the connector. 
""" + from vllm.platforms import current_platform # Get the metadata metadata: ECConnectorMetadata = self._get_connector_metadata() @@ -91,7 +92,9 @@ class ECExampleConnector(ECConnectorBase): if mm_data.mm_hash in encoder_cache: continue filename = self._generate_filename_debug(mm_data.mm_hash) - ec_cache = safetensors.torch.load_file(filename)["ec_cache"].cuda() + ec_cache = safetensors.torch.load_file( + filename, device=current_platform.device_type + )["ec_cache"] encoder_cache[mm_data.mm_hash] = ec_cache logger.debug("Success load encoder cache for hash %s", mm_data.mm_hash) From be2ad5f92060b66788740e2e8302a490fbd226f4 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 18 Dec 2025 01:04:57 -0600 Subject: [PATCH 054/176] [ROCm][Bugfix] fix(structured_output): Skip guidance backend for schemas with patternProperties (#30730) Signed-off-by: Andreas Karatzas --- vllm/v1/engine/input_processor.py | 21 +++++++++++++-- vllm/v1/structured_output/backend_guidance.py | 26 +++++++++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 65e0c845b0afa..29293877cb69d 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -24,7 +24,10 @@ from vllm.tokenizers.mistral import MistralTokenizer from vllm.utils import length_from_prompt_token_ids_or_embeds from vllm.v1.engine import EngineCoreRequest from vllm.v1.metrics.stats import MultiModalCacheStats -from vllm.v1.structured_output.backend_guidance import validate_guidance_grammar +from vllm.v1.structured_output.backend_guidance import ( + has_guidance_unsupported_json_features, + validate_guidance_grammar, +) from vllm.v1.structured_output.backend_lm_format_enforcer import ( validate_structured_output_request_lm_format_enforcer, ) @@ -340,8 +343,22 @@ class InputProcessor: # The request either failed validation # or includes some jsonschema feature(s) that # are not supported in xgrammar. 
- if isinstance(self.tokenizer, MistralTokenizer): + + # Check if schema has features unsupported by guidance + so_params = params.structured_outputs + skip_guidance = False + if so_params.json: + if isinstance(so_params.json, str): + import json + + schema = json.loads(so_params.json) + else: + schema = so_params.json + skip_guidance = has_guidance_unsupported_json_features(schema) + + if isinstance(self.tokenizer, MistralTokenizer) or skip_guidance: # Fall back to outlines if the tokenizer is Mistral + # or if schema contains features unsupported by guidance validate_structured_output_request_outlines(params) params.structured_outputs._backend = "outlines" else: diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index 2962a439dcb3e..727a67333bd71 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -44,6 +44,32 @@ def _walk_json_for_additional_properties(data: object): _walk_json_for_additional_properties(item) +def has_guidance_unsupported_json_features(schema: dict[str, Any]) -> bool: + """Check if JSON schema contains features unsupported by guidance/llguidance.""" + + def check_object(obj: dict[str, Any]) -> bool: + if not isinstance(obj, dict): + return False + + # patternProperties is not supported by llguidance + if "patternProperties" in obj: + return True + + # Recursively check all nested objects and arrays + for value in obj.values(): + if isinstance(value, dict): + if check_object(value): + return True + elif isinstance(value, list): + for item in value: + if isinstance(item, dict) and check_object(item): + return True + + return False + + return check_object(schema) + + def process_for_additional_properties( guide_json: str | dict[str, Any], ) -> dict[str, Any]: From aa7e8360559e639f201f08a4deee490af332b22c Mon Sep 17 00:00:00 2001 From: Chauncey Date: Thu, 18 Dec 2025 15:12:17 +0800 Subject: [PATCH 055/176] [Bugfix] Fix Unicode issues 
in GLM-4 tool calling (#30920) Signed-off-by: chaunceyjiang --- vllm/tool_parsers/glm4_moe_tool_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/tool_parsers/glm4_moe_tool_parser.py b/vllm/tool_parsers/glm4_moe_tool_parser.py index d254fcb5240a5..ebfd91297b417 100644 --- a/vllm/tool_parsers/glm4_moe_tool_parser.py +++ b/vllm/tool_parsers/glm4_moe_tool_parser.py @@ -114,7 +114,8 @@ class Glm4MoeModelToolParser(ToolParser): ToolCall( type="function", function=FunctionCall( - name=tc_name, arguments=json.dumps(arg_dct) + name=tc_name, + arguments=json.dumps(arg_dct, ensure_ascii=False), ), ) ) From 30bb19a760d6d5e8c69b3a4c78c9cb7430872a61 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 18 Dec 2025 02:50:15 -0500 Subject: [PATCH 056/176] [BugFix] Partial revert of #29558 (DeepEP HT + PIECEWISE CG support) (#30910) Signed-off-by: Lucas Wilkinson Signed-off-by: Lucas Wilkinson Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/compile/test_config.py | 38 --------------------------- vllm/config/compilation.py | 50 ++++++++++-------------------------- 2 files changed, 14 insertions(+), 74 deletions(-) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index 04bb56ecb6470..6435d87ba7631 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -233,24 +233,6 @@ def test_splitting_ops_dynamic(): assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE -def test_moe_splitting_ops_deepep_ht_piecewise(): - # Non-inductor, non-attn-fusion case: DeepEP HT with dp>1 - # should add MoE ops to splitting_ops on top of attention ops. 
- config = VllmConfig( - parallel_config=ParallelConfig( - all2all_backend="deepep_high_throughput", - data_parallel_size=8, - ), - compilation_config=CompilationConfig( - mode=CompilationMode.VLLM_COMPILE, - ), - ) - splitting_ops = config.compilation_config.splitting_ops - assert splitting_ops is not None - assert "vllm::moe_forward" in splitting_ops - assert "vllm::moe_forward_shared" in splitting_ops - - def test_moe_splitting_ops_deepep_ht_inductor_partition(): # Inductor partition case: user-provided splitting_ops should be # preserved and MoE ops should be appended for DeepEP HT with dp>1. @@ -277,26 +259,6 @@ def test_moe_splitting_ops_deepep_ht_inductor_partition(): ] -def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor(): - # Pure attn-fusion case without inductor partition: even with - # DeepEP HT and dp>1, we should not re-enable piecewise compilation - # or add MoE ops into splitting_ops. - config = VllmConfig( - parallel_config=ParallelConfig( - all2all_backend="deepep_high_throughput", - data_parallel_size=8, - ), - compilation_config=CompilationConfig( - mode=CompilationMode.VLLM_COMPILE, - pass_config={"fuse_attn_quant": True, "eliminate_noops": True}, - custom_ops=["+quant_fp8"], - cudagraph_mode=CUDAGraphMode.PIECEWISE, - ), - ) - assert config.compilation_config.splitting_ops == [] - assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL - - def test_should_split(): import torch diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 3e3ee1e572ec8..4676039b23961 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -915,8 +915,6 @@ class CompilationConfig: "mode is CompilationMode.VLLM_COMPILE" ) - added_default_splitting_ops = False - if self.pass_config.fuse_attn_quant and not self.use_inductor_graph_partition: self.set_splitting_ops_for_attn_fusion() else: @@ -930,7 +928,6 @@ class CompilationConfig: # for details. 
Make a copy to avoid mutating the class-level # list via reference. self.splitting_ops = list(self._attention_ops) - added_default_splitting_ops = True elif len(self.splitting_ops) == 0: if ( self.cudagraph_mode == CUDAGraphMode.PIECEWISE @@ -958,44 +955,25 @@ class CompilationConfig: self.cudagraph_mode = CUDAGraphMode.FULL self.splitting_ops = [] - # split MoE ops for cudagraph - moe_ops = [ - "vllm::moe_forward", - "vllm::moe_forward_shared", - ] + # Disable CUDA graphs for DeepEP high-throughput since its not CG compatible backend = all2all_backend or envs.VLLM_ALL2ALL_BACKEND dp_size = data_parallel_size if data_parallel_size is not None else 1 - need_moe_splitting = ( + if ( backend == "deepep_high_throughput" and dp_size > 1 - # pure attn-fusion without inductor partition deliberately disables - # piecewise graphs and MoE splitting. - and not ( - self.pass_config.fuse_attn_quant - and not self.use_inductor_graph_partition + and self.cudagraph_mode != CUDAGraphMode.NONE + ): + # TODO: Piecewise Cuda graph might be enabled + # if torch compile cache key issue fixed + # See https://github.com/vllm-project/vllm/pull/25093 + logger.info( + "DeepEP: Disabling CUDA Graphs since DeepEP high-throughput kernels " + "are optimized for prefill and are incompatible with CUDA Graphs. " + "In order to use CUDA Graphs for decode-optimized workloads, " + "use --all2all-backend with another option, such as " + "deepep_low_latency, pplx, or allgather_reducescatter." 
) - ) - - if need_moe_splitting and self.cudagraph_mode != CUDAGraphMode.NONE: - # if we just initialized default splitting_ops for this config, - # automatically append the MoE ops - if added_default_splitting_ops: - for op in moe_ops: - if op not in self.splitting_ops: - self.splitting_ops.append(op) - - # make sure MoE ops are split out - if not any(op in self.splitting_ops for op in moe_ops): - self.cudagraph_mode = CUDAGraphMode.NONE - logger.warning_once( - "DeepEP high throughput backend with data_parallel_size > 1 " - "requires splitting MoE ops from cudagraphs. Please ensure " - "'vllm::moe_forward' or 'vllm::moe_forward_shared' are " - "present in CompilationConfig.splitting_ops." - ) - elif self.cudagraph_mode.has_full_cudagraphs(): - # fall back to piecewise when MoE splitting is required. - self.cudagraph_mode = CUDAGraphMode.PIECEWISE + self.cudagraph_mode = CUDAGraphMode.NONE def set_splitting_ops_for_attn_fusion(self): assert self.pass_config.fuse_attn_quant From 8da6ae49c3d99e147bc739e54fe57e43581d887b Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 18 Dec 2025 02:45:51 -0600 Subject: [PATCH 057/176] [ROCm][Bugfix] Fix `fa_version` argument error in `flash_attn_maxseqlen_wrapper` for ROCm without aiter (#30909) Signed-off-by: Andreas Karatzas --- vllm/attention/ops/vit_attn_wrappers.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py index 5a74e1310133d..f555147bc055a 100644 --- a/vllm/attention/ops/vit_attn_wrappers.py +++ b/vllm/attention/ops/vit_attn_wrappers.py @@ -28,7 +28,7 @@ def flash_attn_maxseqlen_wrapper( max_seqlen: torch.Tensor, batch_size: int, is_rocm_aiter: bool, - fa_version: int, + fa_version: int | None, ) -> torch.Tensor: kwargs = {} if is_rocm_aiter: @@ -36,7 +36,8 @@ def flash_attn_maxseqlen_wrapper( else: from vllm.attention.utils.fa_utils import flash_attn_varlen_func - kwargs["fa_version"] = fa_version + if 
not current_platform.is_rocm() and fa_version is not None: + kwargs["fa_version"] = fa_version q, k, v = (einops.rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) output = flash_attn_varlen_func( q, @@ -62,7 +63,7 @@ def flash_attn_maxseqlen_wrapper_fake( max_seqlen: torch.Tensor, batch_size: int, is_rocm_aiter: bool, - fa_version: int, + fa_version: int | None, ) -> torch.Tensor: return torch.empty_like(q) @@ -82,7 +83,7 @@ def vit_flash_attn_wrapper( max_seqlen: torch.Tensor, batch_size: int, is_rocm_aiter: bool, - fa_version: int, + fa_version: int | None, ) -> torch.Tensor: return torch.ops.vllm.flash_attn_maxseqlen_wrapper( q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter, fa_version From 8372be2828f16dd339b24d46cb6142c9d0afd004 Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Thu, 18 Dec 2025 01:02:38 -0800 Subject: [PATCH 058/176] [moe] Use enable_chunking func (to support disabling chunking) (#29935) Signed-off-by: Ming Yang --- vllm/model_executor/layers/fused_moe/modular_kernel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index b0834e861338f..25308b3106a44 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -743,7 +743,7 @@ class FusedMoEModularKernel(torch.nn.Module): 1, ( M - if not self.fused_experts.supports_chunking() + if not self.fused_experts.enable_chunking() else min(M, envs.VLLM_FUSED_MOE_CHUNK_SIZE) ), ) @@ -786,7 +786,7 @@ class FusedMoEModularKernel(torch.nn.Module): is_forward_context_available() and get_forward_context().attn_metadata is None ) - if is_profile_run and self.fused_experts.supports_chunking() and self.is_dp_ep: + if is_profile_run and self.fused_experts.enable_chunking() and self.is_dp_ep: max_workspace_13, max_workspace_2, max_fused_out_shape = ( self.fused_experts.workspace_shapes( 
envs.VLLM_FUSED_MOE_CHUNK_SIZE, From f90d3636e285a78fbc452f081ea2f55811df1c72 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 18 Dec 2025 17:38:22 +0800 Subject: [PATCH 059/176] [Bugfix][CPU] Fix Mac CPU build (#30955) Signed-off-by: jiang1.li --- csrc/cpu/utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp index fcd7534ab4c5d..88bc3c509790c 100644 --- a/csrc/cpu/utils.cpp +++ b/csrc/cpu/utils.cpp @@ -138,6 +138,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { return ss.str(); } +#endif // VLLM_NUMA_DISABLED namespace cpu_utils { ScratchPadManager::ScratchPadManager() : size_(0), ptr_(nullptr) { @@ -160,4 +161,3 @@ ScratchPadManager* ScratchPadManager::get_scratchpad_manager() { return &manager; } } // namespace cpu_utils -#endif From 96bf50a2c0142597e83de39503ccb7cfc7732d95 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 18 Dec 2025 19:47:46 +0800 Subject: [PATCH 060/176] [ROCm] Serving Fails on Radeon Due to AITER Dtype Import (#30952) Signed-off-by: vllmellm --- vllm/_aiter_ops.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index c32bf04c71c1f..0eae279acf5be 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -24,14 +24,13 @@ def is_aiter_found() -> bool: # we keep this global outside to not cause torch compile breaks. IS_AITER_FOUND = is_aiter_found() -# Can't use dtypes.fp8 directly inside an op -# because it returns wrong result on gfx942. -# This is a workaround to get the correct FP8 dtype. -# This might because that the get_gfx() is wrapped as a custom op. 
-if IS_AITER_FOUND: - from aiter import dtypes - AITER_FP8_DTYPE = dtypes.fp8 +def is_aiter_found_and_supported() -> bool: + if current_platform.is_rocm() and IS_AITER_FOUND: + from vllm.platforms.rocm import on_gfx9 + + return on_gfx9() + return False def if_aiter_supported(func: Callable) -> Callable: @@ -43,17 +42,24 @@ def if_aiter_supported(func: Callable) -> Callable: def wrapper(*args, **kwargs): # checks the platform, device arch and aiter library existence. - if current_platform.is_rocm() and IS_AITER_FOUND: - from vllm.platforms.rocm import on_gfx9 - - if on_gfx9(): - return func(*args, **kwargs) + if is_aiter_found_and_supported(): + return func(*args, **kwargs) return None return wrapper +# Can't use dtypes.fp8 directly inside an op +# because it returns wrong result on gfx942. +# This is a workaround to get the correct FP8 dtype. +# This might because that the get_gfx() is wrapped as a custom op. +if is_aiter_found_and_supported(): + from aiter import dtypes + + AITER_FP8_DTYPE = dtypes.fp8 + + def _rocm_aiter_fused_moe_impl( hidden_states: torch.Tensor, w1: torch.Tensor, From 100f93d2bea44097916c568a7ac642e7fee915ed Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 18 Dec 2025 09:51:17 -0500 Subject: [PATCH 061/176] Filter safetensors files to download if .safetensors.index.json exists (#30537) Signed-off-by: mgoin --- .../model_loader/weight_utils.py | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 610e6a620ade2..0c5961561a7d9 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -23,6 +23,7 @@ import torch from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download from safetensors.torch import load, load_file, safe_open, save_file from tqdm.auto import tqdm +from transformers.utils import SAFE_WEIGHTS_INDEX_NAME from 
vllm import envs from vllm.config import ModelConfig @@ -448,12 +449,31 @@ def download_weights_from_hf( fs = HfFileSystem() file_list = fs.ls(model_name_or_path, detail=False, revision=revision) - # Use the first pattern found in the HF repo's files. - for pattern in allow_patterns: - matching = fnmatch.filter(file_list, pattern) - if len(matching) > 0: - allow_patterns = [pattern] - break + # If downloading safetensors and an index file exists, use the + # specific file names from the index to avoid downloading + # unnecessary files (e.g., from subdirectories like "original/"). + index_file = f"{model_name_or_path}/{SAFE_WEIGHTS_INDEX_NAME}" + if "*.safetensors" in allow_patterns and index_file in file_list: + index_path = hf_hub_download( + repo_id=model_name_or_path, + filename=SAFE_WEIGHTS_INDEX_NAME, + cache_dir=cache_dir, + revision=revision, + ) + with open(index_path) as f: + weight_map = json.load(f)["weight_map"] + if weight_map: + # Extra [] so that weight_map files are treated as a + # single allow_pattern in the loop below + allow_patterns = [list(set(weight_map.values()))] # type: ignore[list-item] + else: + allow_patterns = ["*.safetensors"] + else: + # Use the first pattern found in the HF repo's files. + for pattern in allow_patterns: + if fnmatch.filter(file_list, pattern): + allow_patterns = [pattern] + break except Exception as e: logger.warning( "Failed to get file list for '%s'. Trying each pattern in " @@ -480,6 +500,9 @@ def download_weights_from_hf( ) # If we have downloaded weights for this allow_pattern, # we don't need to check the rest. 
+ # allow_pattern can be a list (from weight_map) or str (glob) + if isinstance(allow_pattern, list): + break if any(Path(hf_folder).glob(allow_pattern)): break time_taken = time.perf_counter() - start_time From eee600c34f87f50e87c7268eab569af16c7c2d22 Mon Sep 17 00:00:00 2001 From: zhrrr <43847754+izhuhaoran@users.noreply.github.com> Date: Thu, 18 Dec 2025 22:52:20 +0800 Subject: [PATCH 062/176] [Misc] support nsys profile for bench latency (#29776) Signed-off-by: zhuhaoran --- vllm/benchmarks/latency.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index 99c1c846f19af..a9d149666e8ba 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -79,10 +79,6 @@ def add_cli_args(parser: argparse.ArgumentParser): def main(args: argparse.Namespace): engine_args = EngineArgs.from_cli_args(args) - if args.profile and not engine_args.profiler_config.profiler == "torch": - raise ValueError( - "The torch profiler is not enabled. Please provide profiler_config." - ) # Lazy import to avoid importing LLM when the bench command is not selected. 
from vllm import LLM, SamplingParams @@ -125,8 +121,8 @@ def main(args: argparse.Namespace): ), ) - def run_to_completion(profile_dir: str | None = None): - if profile_dir: + def run_to_completion(do_profile: bool = False): + if do_profile: llm.start_profile() llm_generate() llm.stop_profile() @@ -139,18 +135,24 @@ def main(args: argparse.Namespace): print("Warming up...") for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): - run_to_completion(profile_dir=None) + run_to_completion(do_profile=False) if args.profile: - profile_dir = engine_args.profiler_config.torch_profiler_dir - print(f"Profiling (results will be saved to '{profile_dir}')...") - run_to_completion(profile_dir=profile_dir) + profiler_config = engine_args.profiler_config + if profiler_config.profiler == "torch": + print( + "Profiling with torch profiler (results will be saved to" + f" {profiler_config.torch_profiler_dir})..." + ) + elif profiler_config.profiler == "cuda": + print("Profiling with cuda profiler ...") + run_to_completion(do_profile=True) return # Benchmark. 
latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): - latencies.append(run_to_completion(profile_dir=None)) + for _ in tqdm(range(args.num_iters), desc="Bench iterations"): + latencies.append(run_to_completion(do_profile=False)) latencies = np.array(latencies) percentages = [10, 25, 50, 75, 90, 99] percentiles = np.percentile(latencies, percentages) From 66287582339d1f27c951f0108aac9aa7377fe643 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 18 Dec 2025 10:27:51 -0500 Subject: [PATCH 063/176] [Bug] Fix batch invariant in torch 2.10 (#30907) Signed-off-by: yewentao256 Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Cyrus Leung --- vllm/model_executor/layers/batch_invariant.py | 44 +++++++++---------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index fde0826779eb1..1058270889b29 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -933,30 +933,26 @@ def enable_batch_invariant_mode(): _batch_invariant_MODE = True _batch_invariant_LIB = torch.library.Library("aten", "IMPL") - # Batch invariant matmuls are no longer needed after cublas overrides - if not is_torch_equal_or_newer("2.10.0.dev"): - if ( - current_platform.is_device_capability_family(100) - or current_platform.is_device_capability(80) - or current_platform.is_device_capability(89) - ): - # For PyTorch 2.9, B200 uses GEMV for bs=1 - # Requires https://github.com/pytorch/pytorch/pull/166735 - _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA") - _batch_invariant_LIB.impl("aten::addmm", addmm_batch_invariant, "CUDA") - _batch_invariant_LIB.impl("aten::matmul", matmul_batch_invariant, "CUDA") - _batch_invariant_LIB.impl("aten::linear", linear_batch_invariant, "CUDA") - else: - # Only source of batch 
invariance for Hopper is split-k, can disable through - # cuBLAS workspace config - _original_cublas_workspace_cfg = os.environ.get( - "CUBLAS_WORKSPACE_CONFIG", None - ) - _original_cublaslt_workspace_size = os.environ.get( - "CUBLASLT_WORKSPACE_SIZE", None - ) - os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" - os.environ["CUBLASLT_WORKSPACE_SIZE"] = "1" + if ( + current_platform.is_device_capability_family(100) + or current_platform.is_device_capability(80) + or current_platform.is_device_capability(89) + ): + # For PyTorch 2.9, B200 uses GEMV for bs=1 + # Requires https://github.com/pytorch/pytorch/pull/166735 + _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA") + _batch_invariant_LIB.impl("aten::addmm", addmm_batch_invariant, "CUDA") + _batch_invariant_LIB.impl("aten::matmul", matmul_batch_invariant, "CUDA") + _batch_invariant_LIB.impl("aten::linear", linear_batch_invariant, "CUDA") + else: + # Only source of batch invariance for Hopper is split-k, can disable through + # cuBLAS workspace config + _original_cublas_workspace_cfg = os.environ.get("CUBLAS_WORKSPACE_CONFIG", None) + _original_cublaslt_workspace_size = os.environ.get( + "CUBLASLT_WORKSPACE_SIZE", None + ) + os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" + os.environ["CUBLASLT_WORKSPACE_SIZE"] = "1" _batch_invariant_LIB.impl( "aten::_log_softmax", _log_softmax_batch_invariant, "CUDA" From 28d15ab56bd9d3fd17010bc4abaeec06988f7887 Mon Sep 17 00:00:00 2001 From: sarathc-cerebras Date: Thu, 18 Dec 2025 21:16:58 +0530 Subject: [PATCH 064/176] adds jais 2 support (#30188) Signed-off-by: sarathc-cerebras Co-authored-by: Cyrus Leung --- docs/models/supported_models.md | 1 + tests/models/registry.py | 3 + vllm/model_executor/models/jais2.py | 529 +++++++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 4 files changed, 534 insertions(+) create mode 100644 vllm/model_executor/models/jais2.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 
9ba0f4ca9096e..3ffbf63f9a18b 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -406,6 +406,7 @@ th { | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | | `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | +| `Jais2ForCausalLM` | Jais2 | `inceptionai/Jais-2-8B-Chat`, `inceptionai/Jais-2-70B-Chat`, etc. | | ✅︎ | | `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | | `KimiLinearForCausalLM` | Kimi-Linear-48B-A3B-Base, Kimi-Linear-48B-A3B-Instruct | `moonshotai/Kimi-Linear-48B-A3B-Base`, `moonshotai/Kimi-Linear-48B-A3B-Instruct` | | ✅︎ | | `Lfm2ForCausalLM` | LFM2 | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. 
| ✅︎ | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index c5d72b5d581b9..fa70e94abd865 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -295,6 +295,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "internlm/internlm3-8b-instruct", trust_remote_code=True ), "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), + "Jais2ForCausalLM": _HfExamplesInfo( + "inceptionai/Jais-2-8B-Chat", min_transformers_version="4.58" + ), "JambaForCausalLM": _HfExamplesInfo( "ai21labs/AI21-Jamba-1.5-Mini", extras={ diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py new file mode 100644 index 0000000000000..01e75338a8ced --- /dev/null +++ b/vllm/model_executor/models/jais2.py @@ -0,0 +1,529 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Inference-only Jais2 model compatible with HuggingFace weights.""" + +from collections.abc import Iterable + +import torch +from torch import nn +from transformers import Jais2Config + +from vllm.attention.layer import Attention +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import ( + get_pp_group, + get_tensor_model_parallel_world_size, +) +from vllm.model_executor.layers.activation import ReLUSquaredActivation +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsLoRA, SupportsPP +from .utils import ( + AutoWeightsLoader, + PPMissingLayer, + extract_layer_index, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) + + +class Jais2MLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: QuantizationConfig | None = None, + bias: bool = False, + prefix: str = "", + ) -> None: + super().__init__() + self.up_proj = ColumnParallelLinear( + input_size=hidden_size, + output_size=intermediate_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.up_proj", + ) + self.down_proj = RowParallelLinear( + input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.down_proj", + ) + self.act_fn 
= ReLUSquaredActivation() + + def forward(self, x): + x, _ = self.up_proj(x) + x = self.act_fn(x) + x, _ = self.down_proj(x) + return x + + +class Jais2Attention(nn.Module): + def __init__( + self, + config: Jais2Config, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + max_position_embeddings: int = 8192, + quant_config: QuantizationConfig | None = None, + bias: bool = False, + cache_config: CacheConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + layer_idx = extract_layer_index(prefix) + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + # MistralConfig has an optional head_dim introduced by Mistral-Nemo + self.head_dim = getattr( + config, "head_dim", self.hidden_size // self.total_num_heads + ) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + is_neox_style = True + if quant_config is not None and quant_config.get_name() == "gguf": + is_neox_style = False + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + rope_parameters=getattr(config, "rope_parameters", None), + is_neox_style=is_neox_style, + ) + + if hasattr(config, "interleaved_sliding_window"): + interleaved_sliding_window = config.interleaved_sliding_window + if isinstance(interleaved_sliding_window, int): + sliding_window = interleaved_sliding_window + elif isinstance(interleaved_sliding_window, list): + sw_idx = layer_idx % len(interleaved_sliding_window) + sliding_window = interleaved_sliding_window[sw_idx] + else: + raise ValueError( + f"{type(interleaved_sliding_window)} is not supported." 
+ ) + else: + sliding_window = None + + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + per_layer_sliding_window=sliding_window, + prefix=f"{prefix}.attn", + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class Jais2DecoderLayer(nn.Module): + def __init__( + self, + vllm_config: VllmConfig, + config: Jais2Config, + prefix: str = "", + ) -> None: + super().__init__() + + config = config or vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = self.get_quant_config(vllm_config) + + self.hidden_size = config.hidden_size + max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + # Support abacusai/Smaug-72B-v0.1 with attention_bias + # Support internlm/internlm-7b with bias + attention_bias = getattr(config, "attention_bias", False) or getattr( + config, "bias", False + ) + self.self_attn = Jais2Attention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=getattr( + config, "num_key_value_heads", config.num_attention_heads + ), + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + bias=attention_bias, + cache_config=cache_config, + prefix=f"{prefix}.self_attn", + ) + self.mlp = Jais2MLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + bias=getattr(config, "mlp_bias", False), + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps + ) + 
self.post_attention_layernorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = ( + self.input_layernorm(hidden_states + residual), + hidden_states + residual, + ) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + # Fully Connected + hidden_states, residual = ( + self.post_attention_layernorm(hidden_states + residual), + hidden_states + residual, + ) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None: + """Get quantization config for this layer. Override in subclasses.""" + return vllm_config.quant_config + + +@support_torch_compile +class Jais2Model(nn.Module): + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + layer_type: type[nn.Module] = Jais2DecoderLayer, + ): + super().__init__() + + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + self.quant_config = quant_config + self.padding_idx = config.pad_token_id + lora_vocab = ( + (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) + if lora_config + else 0 + ) + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + if get_pp_group().is_first_rank or ( + config.tie_word_embeddings and get_pp_group().is_last_rank + ): + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + quant_config=quant_config, + ) + else: + self.embed_tokens = PPMissingLayer() + self.start_layer, self.end_layer, 
self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: layer_type( + config=config, + vllm_config=vllm_config, + prefix=prefix, + ), + prefix=f"{prefix}.layers", + ) + if get_pp_group().is_last_rank: + self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + else: + self.norm = PPMissingLayer() + + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors | tuple[torch.Tensor, list[torch.Tensor]]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + + hidden_states, _ = self.norm(hidden_states + residual), residual + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if "rotary_emb.cos_cached" in 
name or "rotary_emb.sin_cached" in name: + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + if self.quant_config is not None and ( + scale_name := self.quant_config.get_cache_scale(name) + ): + # Loading kv cache scales for compressed-tensors quantization + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + loaded_weight = loaded_weight[0] + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + if "scale" in name: + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class Jais2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + } + + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config + self.lora_config = lora_config + + self.model = self._init_model( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + + if get_pp_group().is_last_rank: + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=( + DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config + else lora_config.lora_vocab_padding_size + ), + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) + if config.tie_word_embeddings: + self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) + + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor( + self.unpadded_vocab_size, config.vocab_size, logit_scale + ) + else: + self.lm_head = PPMissingLayer() + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + def _init_model(self, vllm_config: VllmConfig, prefix: str = ""): + return 
Jais2Model(vllm_config=vllm_config, prefix=prefix) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + model_output = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return model_output + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 4575e91e13959..d332f51152484 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -127,6 +127,7 @@ _TEXT_GENERATION_MODELS = { "InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"), "InternLM3ForCausalLM": ("llama", "LlamaForCausalLM"), "JAISLMHeadModel": ("jais", "JAISLMHeadModel"), + "Jais2ForCausalLM": ("jais2", "Jais2ForCausalLM"), "JambaForCausalLM": ("jamba", "JambaForCausalLM"), "KimiLinearForCausalLM": ("kimi_linear", "KimiLinearForCausalLM"), # noqa: E501 "Lfm2ForCausalLM": ("lfm2", "Lfm2ForCausalLM"), From 0db5439ded739731474009217a7515fd52c221c8 Mon Sep 17 00:00:00 2001 From: Lucas Kabela Date: Thu, 18 Dec 2025 08:23:31 -0800 Subject: [PATCH 065/176] [Bugfix][torch2.10] Fix test_qwen2_5_vl_compilation with 2.10 RC (#30822) Signed-off-by: Lucas Kabela Co-authored-by: Cyrus Leung --- vllm/compilation/backends.py | 7 ++++--- vllm/compilation/caching.py | 10 ++++++++-- 
vllm/compilation/piecewise_backend.py | 3 +-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index a1eec7d74483f..2fb6265560b19 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -520,6 +520,7 @@ class VllmBackend: self, vllm_config: VllmConfig, prefix: str = "", + is_encoder: bool = False, ): # if the model is initialized with a non-empty prefix, # then usually it's enough to use that prefix, @@ -530,7 +531,7 @@ class VllmBackend: self.prefix = prefix or model_tag # Mark compilation for encoder. - self.is_encoder = model_is_encoder + self.is_encoder = is_encoder or model_is_encoder # Passes to run on the graph post-grad. self.pass_manager = resolve_obj_by_qualname( @@ -797,7 +798,7 @@ class VllmBackend: or not self.compilation_config.cudagraph_copy_inputs ): return VllmSerializableFunction( - graph, example_inputs, self.prefix, self.split_gm + graph, example_inputs, self.prefix, self.split_gm, self.is_encoder ) # index of tensors that have symbolic shapes (batch size) @@ -835,5 +836,5 @@ class VllmBackend: return self.split_gm(*list_args) return VllmSerializableFunction( - graph, example_inputs, self.prefix, copy_and_call + graph, example_inputs, self.prefix, copy_and_call, self.is_encoder ) diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py index fc02a08f74265..8c9ec87bcad56 100644 --- a/vllm/compilation/caching.py +++ b/vllm/compilation/caching.py @@ -37,12 +37,15 @@ class VllmSerializableFunction(SerializableCallable): serializing the Dynamo fx graph plus example inputs. 
""" - def __init__(self, graph_module, example_inputs, prefix, optimized_call): + def __init__( + self, graph_module, example_inputs, prefix, optimized_call, is_encoder=False + ): assert isinstance(graph_module, torch.fx.GraphModule) self.graph_module = graph_module self.example_inputs = example_inputs self.prefix = prefix self.optimized_call = optimized_call + self.is_encoder = is_encoder self.shape_env = None sym_input = next( (i for i in self.example_inputs if isinstance(i, torch.SymInt)), None @@ -106,7 +109,10 @@ class VllmSerializableFunction(SerializableCallable): state["graph_module"] = GraphPickler.loads(state["graph_module"], fake_mode) state["graph_module"].recompile() state["example_inputs"] = GraphPickler.loads(state["example_inputs"], fake_mode) - vllm_backend = VllmBackend(get_current_vllm_config(), state["prefix"]) + is_encoder = state.get("is_encoder", False) + vllm_backend = VllmBackend( + get_current_vllm_config(), state["prefix"], is_encoder + ) def optimized_call(*example_inputs): """ diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 58d3e2a14b22a..12cc49971e08b 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -170,8 +170,7 @@ class PiecewiseBackend: range_entry = self._find_range_for_shape(runtime_shape) assert range_entry is not None, ( - f"Shape out of considered range: {runtime_shape} " - "[1, max_num_batched_tokens]" + f"Shape: {runtime_shape} out of considered ranges: {self.compile_ranges}" ) self._maybe_compile_for_range_entry(range_entry, args) From 326e7c31055812277957e3e2b43715b4f366facb Mon Sep 17 00:00:00 2001 From: wzyrrr <53074341+wzyrrr@users.noreply.github.com> Date: Fri, 19 Dec 2025 00:29:33 +0800 Subject: [PATCH 066/176] [Doc] Add Sophgo TPU Support (#30949) Co-authored-by: zhaoyang.wang --- docs/getting_started/installation/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/getting_started/installation/README.md 
b/docs/getting_started/installation/README.md index cff7ce1a882a1..9b93a6b9ac12c 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -27,3 +27,4 @@ The backends below live **outside** the main `vllm` repository and follow the | IBM Spyre AIU | `vllm-spyre` | | | Cambricon MLU | `vllm-mlu` | | | Baidu Kunlun XPU | N/A, install from source | | +| Sophgo TPU | N/A, install from source | | From 9a5e96523be33f7ddd5aa56421b1e41000c0f2e2 Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Thu, 18 Dec 2025 08:42:22 -0800 Subject: [PATCH 067/176] [LoRA] Set default MXFP4 LoRA backend to Marlin (#30598) Signed-off-by: Xin Yang Co-authored-by: Cyrus Leung --- vllm/model_executor/layers/quantization/mxfp4.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index e96e87d15787d..832925825c453 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -95,12 +95,12 @@ def get_mxfp4_backend_with_lora() -> Mxfp4Backend: # SM120 needs this fix: https://github.com/triton-lang/triton/pull/8498 and (9, 0) <= current_platform.get_device_capability() < (11, 0) ) - if envs.VLLM_MXFP4_USE_MARLIN or not triton_kernels_supported: - logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend") - return Mxfp4Backend.MARLIN + if envs.VLLM_MXFP4_USE_MARLIN is False and triton_kernels_supported: + logger.info_once("[get_mxfp4_backend_with_lora] Using Triton backend") + return Mxfp4Backend.TRITON - logger.info_once("[get_mxfp4_backend_with_lora] Using Triton backend") - return Mxfp4Backend.TRITON + logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend") + return Mxfp4Backend.MARLIN def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend: From f4ee2c3d908ade890bc2e42945d307bc2c341b59 Mon 
class CopyNumelCounter(TorchDispatchMode):
    """Dispatch mode that accumulates how many elements ``copy_`` wrote.

    Useful for tracking weight loading where the underlying weights can be
    arbitrarily transformed (e.g. via ``narrow``) before the final copy, so
    counting the source tensor's elements would be unreliable.
    """

    def __init__(self):
        super().__init__()
        # Running total of destination elements written by aten.copy_.
        self.copied_numel = 0

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = {} if kwargs is None else kwargs
        result = func(*args, **kwargs)
        # Only in-place copies contribute; the destination is the first arg.
        if func == torch.ops.aten.copy_.default:
            destination = args[0]
            self.copied_numel += destination.numel()
        return result
Supports loading FP8 checkpoints with static weight scale and @@ -469,13 +490,15 @@ class Fp8LinearMethod(LinearMethodBase): else: def patched_weight_loader(param, loaded_weight, *args, **kwargs): - # load the current weight chunk - res = weight_loader(param, loaded_weight, *args, **kwargs) # type: ignore[misc] - # track how many elements we have updated if not hasattr(layer, "_loaded_numel"): layer._loaded_numel = 0 - layer._loaded_numel += loaded_weight.numel() + + # load the current weight chunk + copy_numel_counter = CopyNumelCounter() + with copy_numel_counter: + res = weight_loader(param, loaded_weight, *args, **kwargs) # type: ignore[misc] + layer._loaded_numel += copy_numel_counter.copied_numel # if we have loaded all of the elements, call # process_weights_after_loading @@ -1348,13 +1371,15 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod): new_extra_weight_attrs = extra_weight_attrs def patched_weight_loader(param, loaded_weight, *args, **kwargs): - # load the current weight chunk - res = weight_loader(param, loaded_weight, *args, **kwargs) # type: ignore[misc] - # add a counter to track how many elements we have updated if not hasattr(layer, "_loaded_numel"): layer._loaded_numel = 0 - layer._loaded_numel += loaded_weight.numel() + + # load the current weight chunk + copy_numel_counter = CopyNumelCounter() + with copy_numel_counter: + res = weight_loader(param, loaded_weight, *args, **kwargs) # type: ignore[misc] + layer._loaded_numel += copy_numel_counter.copied_numel # if we have loaded all of the elements, call # process_weights_after_loading From 686cbaac643c3412036728dd5e6bc29d6cce1a9f Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 18 Dec 2025 09:17:00 -0800 Subject: [PATCH 069/176] [Cleanup] Remove unused ModelRunner V1 `InputBatch.num_tokens` field (#30218) Signed-off-by: Nick Hill --- vllm/v1/worker/gpu_input_batch.py | 28 +++++++++------------------- vllm/v1/worker/gpu_model_runner.py | 5 ----- vllm/v1/worker/tpu_input_batch.py | 11 +---------- 
vllm/v1/worker/tpu_model_runner.py | 4 ++-- 4 files changed, 12 insertions(+), 36 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index ead7a3619dea5..08b595845bb40 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -128,7 +128,6 @@ class InputBatch: # allocation if max_model_len is big. # Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size) self.req_prompt_embeds: dict[int, torch.Tensor] = {} - self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32) self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32) self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) self.num_computed_tokens_cpu_tensor = torch.zeros( @@ -340,9 +339,6 @@ class InputBatch: self.req_prompt_embeds[req_index] = request.prompt_embeds self.token_ids_cpu[req_index, start_idx:end_idx] = request.output_token_ids self.is_token_ids[req_index, start_idx:end_idx] = True - # Number of token ids in prompt (token_ids_cpu or prompt_embeds). - # NOTE(woosuk): This may include spec decode tokens. - self.num_tokens[req_index] = request.num_tokens # Number of tokens without spec decode tokens. 
self.num_tokens_no_spec[req_index] = request.num_tokens @@ -522,10 +518,6 @@ class InputBatch: self.req_id_to_index[old_id_i2], self.req_id_to_index[old_id_i1], ) - self.num_tokens[i1], self.num_tokens[i2] = ( - self.num_tokens[i2], - self.num_tokens[i1], - ) self.num_tokens_no_spec[i1], self.num_tokens_no_spec[i2] = ( self.num_tokens_no_spec[i2], self.num_tokens_no_spec[i1], @@ -661,17 +653,16 @@ class InputBatch: self.req_output_token_ids[last_req_index] = None self.req_id_to_index[req_id] = empty_index - if last_req_index != empty_index: - ( - self.spec_token_ids[last_req_index], - self.spec_token_ids[empty_index], - ) = ( - self.spec_token_ids[empty_index], - self.spec_token_ids[last_req_index], - ) - self.spec_token_ids[last_req_index].clear() + num_tokens = self.num_tokens_no_spec[last_req_index] + len( + self.spec_token_ids[last_req_index] + ) + + (self.spec_token_ids[last_req_index], self.spec_token_ids[empty_index]) = ( + self.spec_token_ids[empty_index], + self.spec_token_ids[last_req_index], + ) + self.spec_token_ids[last_req_index].clear() - num_tokens = self.num_tokens[last_req_index] self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[ last_req_index, :num_tokens ] @@ -682,7 +673,6 @@ class InputBatch: self.req_prompt_embeds[empty_index] = self.req_prompt_embeds.pop( last_req_index ) - self.num_tokens[empty_index] = num_tokens self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[ last_req_index ] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a44150432434b..36637b98ea823 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -923,7 +923,6 @@ class GPUModelRunner( self.input_batch.num_prompt_tokens[req_index] + num_output_tokens ) - self.input_batch.num_tokens[req_index] = end_idx self.input_batch.num_tokens_no_spec[req_index] = end_idx # Update the block IDs. 
@@ -968,7 +967,6 @@ class GPUModelRunner( req_index, start_token_index:end_token_index ] = new_token_ids self.input_batch.num_tokens_no_spec[req_index] = end_token_index - self.input_batch.num_tokens[req_index] = end_token_index # Add spec_token_ids to token_ids_cpu. spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get( @@ -984,8 +982,6 @@ class GPUModelRunner( self.input_batch.token_ids_cpu[ req_index, start_index:end_token_index ] = spec_token_ids - # NOTE(woosuk): `num_tokens` here may include spec tokens. - self.input_batch.num_tokens[req_index] += num_spec_tokens # When speculative decoding is used with structured output, # the scheduler can drop draft tokens that do not @@ -2702,7 +2698,6 @@ class GPUModelRunner( self.input_batch.token_ids_cpu[req_idx, start_idx:end_idx] = sampled_ids self.input_batch.is_token_ids[req_idx, start_idx:end_idx] = True self.input_batch.num_tokens_no_spec[req_idx] = end_idx - self.input_batch.num_tokens[req_idx] = end_idx req_id = req_ids[req_idx] req_state = self.requests[req_id] diff --git a/vllm/v1/worker/tpu_input_batch.py b/vllm/v1/worker/tpu_input_batch.py index 2ed65ca9d31cd..3758a73ee4967 100644 --- a/vllm/v1/worker/tpu_input_batch.py +++ b/vllm/v1/worker/tpu_input_batch.py @@ -51,7 +51,6 @@ class InputBatch: pin_memory=False, ) self.token_ids_cpu = self.token_ids_cpu_tensor.numpy() - self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32) self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32) self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) self.num_computed_tokens_cpu_tensor = torch.zeros( @@ -200,9 +199,6 @@ class InputBatch: start_idx = num_prompt_tokens end_idx = start_idx + len(request.output_token_ids) self.token_ids_cpu[req_index, start_idx:end_idx] = request.output_token_ids - # Number of token ids in token_ids_cpu. - # NOTE(woosuk): This may include spec decode tokens. - self.num_tokens[req_index] = request.num_tokens # Number of tokens without spec decode tokens. 
self.num_tokens_no_spec[req_index] = request.num_tokens @@ -344,10 +340,6 @@ class InputBatch: self.req_id_to_index[old_id_i2], self.req_id_to_index[old_id_i1], ) - self.num_tokens[i1], self.num_tokens[i2] = ( - self.num_tokens[i2], - self.num_tokens[i1], - ) self.num_tokens_no_spec[i1], self.num_tokens_no_spec[i2] = ( self.num_tokens_no_spec[i2], self.num_tokens_no_spec[i1], @@ -448,11 +440,10 @@ class InputBatch: self.req_output_token_ids[last_req_index] = None self.req_id_to_index[req_id] = empty_index - num_tokens = self.num_tokens[last_req_index] + num_tokens = self.num_tokens_no_spec[last_req_index] self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[ last_req_index, :num_tokens ] - self.num_tokens[empty_index] = num_tokens self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[ last_req_index ] diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 283f21b779e38..c7404c4642d7e 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1283,7 +1283,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): token_id = valid_sampled_token_ids[i][0] self.input_batch.token_ids_cpu[i, seq_len] = token_id req_state.output_token_ids.append(token_id) - self.input_batch.num_tokens[i] += 1 + self.input_batch.num_tokens_no_spec[i] += 1 else: valid_mask = selected_token_ids != INVALID_TOKEN_ID @@ -1291,7 +1291,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): valid_sampled_token_ids = [ seq.tolist() for seq in selected_token_ids[valid_mask].split(gen_lens) ] - self.input_batch.num_tokens[:num_reqs] += gen_lens + self.input_batch.num_tokens_no_spec[:num_reqs] += gen_lens for i, req_state, seq_len in request_seq_lens: target_slice = slice(seq_len - gen_lens[i] + 1, seq_len + 1) self.input_batch.token_ids_cpu[i, target_slice] = ( From 500f26e6d35d9743167bc4908c01ca356e543836 Mon Sep 17 00:00:00 2001 From: inkcherry Date: Fri, 19 Dec 2025 
01:50:42 +0800 Subject: [PATCH 070/176] [Bugfix] fix DP-aware routing in OpenAI API requests (#29002) Signed-off-by: inkcherry --- tests/entrypoints/openai/test_chat_error.py | 1 + .../openai/test_completion_error.py | 1 + tests/entrypoints/openai/test_serving_chat.py | 1 + tests/v1/engine/test_async_llm.py | 61 +++++++++++++++++++ vllm/entrypoints/openai/serving_chat.py | 1 + vllm/entrypoints/openai/serving_completion.py | 1 + vllm/entrypoints/openai/serving_engine.py | 2 + 7 files changed, 68 insertions(+) diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py index b194e9b74d874..1f30d8cf1e8cc 100644 --- a/tests/entrypoints/openai/test_chat_error.py +++ b/tests/entrypoints/openai/test_chat_error.py @@ -76,6 +76,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: lora_request, trace_headers, priority, + data_parallel_rank, ): return dict(engine_prompt), {} diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py index ca56cc2ddb6a7..6643aa471321b 100644 --- a/tests/entrypoints/openai/test_completion_error.py +++ b/tests/entrypoints/openai/test_completion_error.py @@ -73,6 +73,7 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion: lora_request, trace_headers, priority, + data_parallel_rank, ): return dict(engine_prompt), {} diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 2befa40d636da..69d7b1ceedf59 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -396,6 +396,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: lora_request, trace_headers, priority, + data_parallel_rank, ): return dict(engine_prompt), {} diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 25af55baa91f4..224e5d741024b 100644 --- a/tests/v1/engine/test_async_llm.py +++ 
b/tests/v1/engine/test_async_llm.py @@ -11,6 +11,13 @@ from vllm import SamplingParams from vllm.assets.image import ImageAsset from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ChatCompletionResponse, + ErrorResponse, +) +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.inputs import PromptType from vllm.outputs import RequestOutput from vllm.platforms import current_platform @@ -484,6 +491,60 @@ async def test_dp_rank_argument(): pass +@pytest.mark.asyncio(scope="module") +async def test_header_dp_rank_argument(): + with ExitStack() as after: + with set_default_torch_num_threads(1): + engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) + after.callback(engine.shutdown) + + MODEL_NAME = "test-model" + BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)] + + # Create models first + models = OpenAIServingModels( + engine_client=engine, + base_model_paths=BASE_MODEL_PATHS, + ) + + # Create serving chat instance + serving_chat = OpenAIServingChat( + engine_client=engine, + models=models, + response_role="assistant", + chat_template=None, + chat_template_content_format="auto", + request_logger=None, + ) + # Create a chat completion request + req = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{"role": "user", "content": TEXT_PROMPT}], + max_tokens=100, + temperature=1.0, + seed=33, + ) + # Test 1: Valid DP rank (0) + mock_raw_request = MagicMock() + mock_raw_request.headers = {"X-data-parallel-rank": "0"} + mock_raw_request.state = MagicMock() + + # Should succeed with valid rank + response = await serving_chat.create_chat_completion(req, mock_raw_request) + assert isinstance(response, ChatCompletionResponse), ( + "Expected a ChatCompletionResponse for valid DP rank" + ) + + # Test 2: Out-of-range DP rank (1) + 
mock_raw_request.headers = {"X-data-parallel-rank": "1"} + + # should return ErrorResponse for out-of-range rank + response2 = await serving_chat.create_chat_completion(req, mock_raw_request) + assert isinstance(response2, ErrorResponse), ( + "Expected an ErrorResponse for out-of-range DP rank" + ) + + @pytest.mark.asyncio async def test_check_health(): """Test that check_health returns normally for healthy engine diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 95df373502bfd..04967cbe268dd 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -381,6 +381,7 @@ class OpenAIServingChat(OpenAIServing): lora_request=lora_request, trace_headers=trace_headers, priority=request.priority, + data_parallel_rank=data_parallel_rank, ) generator = self.engine_client.generate( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 1be0afc8c74e5..265ca9915e5db 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -230,6 +230,7 @@ class OpenAIServingCompletion(OpenAIServing): lora_request=lora_request, trace_headers=trace_headers, priority=request.priority, + data_parallel_rank=data_parallel_rank, ) generator = self.engine_client.generate( diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 5f7cfaa53ec18..b9771963c6d4c 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1231,6 +1231,7 @@ class OpenAIServing: lora_request: LoRARequest | None, trace_headers: Mapping[str, str] | None, priority: int, + data_parallel_rank: int | None = None, ) -> tuple[EngineCoreRequest, dict[str, Any]]: """Use the Processor to process inputs for AsyncLLM.""" tokenization_kwargs: dict[str, Any] = {} @@ -1246,6 +1247,7 @@ class OpenAIServing: tokenization_kwargs=tokenization_kwargs, 
trace_headers=trace_headers, priority=priority, + data_parallel_rank=data_parallel_rank, ) return engine_request, tokenization_kwargs From 62be3670cb97e7196c30be26fe347d12d183429c Mon Sep 17 00:00:00 2001 From: Alec <35311602+alec-flowers@users.noreply.github.com> Date: Thu, 18 Dec 2025 09:52:55 -0800 Subject: [PATCH 071/176] [BugFix] Add sleep to fix tight loop and release GIL (#29476) Signed-off-by: alec-flowers Signed-off-by: Alec <35311602+alec-flowers@users.noreply.github.com> Co-authored-by: Nick Hill --- vllm/v1/engine/core.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 0045b8c1dd3e7..9e2571201a684 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -923,6 +923,13 @@ class EngineCoreProc(EngineCore): # Post-step hook. self.post_step(model_executed) + # If no model execution happened but there are waiting requests + # (e.g., WAITING_FOR_REMOTE_KVS), yield the GIL briefly to allow + # background threads (like NIXL handshake) to make progress. + # Without this, the tight polling loop can starve background threads. 
+ if not model_executed and self.scheduler.has_unfinished_requests(): + time.sleep(0.001) + return model_executed def _handle_client_request( From 700a5ad6c616358f42db7d9b55e8bc9caa140ca5 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 19 Dec 2025 02:04:19 +0800 Subject: [PATCH 072/176] [MM Encoder]: Migrate legacy ViT `MultiHeadAttention` to new `MMEncoderAttention` interface (#30684) Signed-off-by: Isotr0py --- tests/kernels/attention/test_attention.py | 5 +- tests/kernels/attention/test_mha_attn.py | 78 +++++++++-- tests/v1/tpu/test_mha_attn.py | 6 +- vllm/attention/layer.py | 132 ------------------ vllm/attention/layers/mm_encoder_attention.py | 90 ++++-------- vllm/attention/ops/vit_attn_wrappers.py | 53 +++++-- vllm/model_executor/models/aimv2.py | 4 +- vllm/model_executor/models/blip.py | 4 +- vllm/model_executor/models/clip.py | 11 +- vllm/model_executor/models/deepencoder.py | 4 +- vllm/model_executor/models/glm4v.py | 4 +- vllm/model_executor/models/hunyuan_vision.py | 4 +- .../models/idefics2_vision_model.py | 8 +- vllm/model_executor/models/intern_vit.py | 4 +- vllm/model_executor/models/interns1_vit.py | 8 +- vllm/model_executor/models/mllama4.py | 4 +- vllm/model_executor/models/molmo.py | 5 +- vllm/model_executor/models/siglip.py | 10 +- vllm/model_executor/models/step3_vl.py | 8 +- vllm/model_executor/models/whisper.py | 6 +- 20 files changed, 182 insertions(+), 266 deletions(-) diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 1a7d5ce0ddc1e..96bdcf16d5689 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -9,7 +9,8 @@ import torch from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck from vllm import _custom_ops as ops -from vllm.attention.layer import Attention, MultiHeadAttention +from vllm.attention.layer import Attention +from vllm.attention.layers.mm_encoder_attention 
import MMEncoderAttention from vllm.platforms import current_platform from vllm.utils.mem_utils import get_max_shared_memory_bytes @@ -442,7 +443,7 @@ def ref_multi_query_kv_attention( return torch.cat(ref_outputs, dim=0) -@pytest.mark.parametrize("attention_cls", [Attention, MultiHeadAttention]) +@pytest.mark.parametrize("attention_cls", [Attention, MMEncoderAttention]) def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None: head_size = 64 scale = float(1.0 / (head_size**0.5)) diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index 639abdf6f0487..7405e4d41da94 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -3,16 +3,17 @@ """ Test: -* Tests for MultiHeadAttention layer +* Tests for MMEncoderAttention layer """ +import itertools from unittest.mock import patch import pytest import torch from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.attention.layer import MultiHeadAttention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.attention.selector import _cached_get_attn_backend from vllm.platforms import current_platform from vllm.platforms.cpu import CpuPlatform @@ -42,35 +43,31 @@ def test_mha_attn_platform(device: str): if device == "cpu": with ( - patch("vllm.attention.layer.current_platform", CpuPlatform()), patch("vllm.model_executor.models.vision.current_platform", CpuPlatform()), ): - attn = MultiHeadAttention(16, 64, scale=1) + attn = MMEncoderAttention(16, 64, scale=1) assert attn.attn_backend == AttentionBackendEnum.TORCH_SDPA elif device == "hip": with ( - patch("vllm.attention.layer.current_platform", RocmPlatform()), patch("vllm.model_executor.models.vision.current_platform", RocmPlatform()), ): - attn = MultiHeadAttention(16, 64, scale=1) + attn = MMEncoderAttention(16, 64, scale=1) assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN else: # Test CUDA with 
head_size=64 (divisible by 32) # - should use vLLM's FlashAttention with ( - patch("vllm.attention.layer.current_platform", CudaPlatform()), patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()), ): - attn = MultiHeadAttention(16, 64, scale=1) + attn = MMEncoderAttention(16, 64, scale=1) assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN # Test CUDA with head_size=72 (not divisible by 32) # - should use vLLM's FlashAttention with ( - patch("vllm.attention.layer.current_platform", CudaPlatform()), patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()), ): - attn = MultiHeadAttention(16, 72, scale=1) + attn = MMEncoderAttention(16, 72, scale=1) assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN @@ -94,6 +91,10 @@ def ref_attention( BATCH_SIZES = [1, 16] SEQ_LENS = [1] +VAR_SEQ_LENS = [ + [2, 2], + [2, 3, 4], +] NUM_HEADS = [1, 16] NUM_KV_HEADS = [1] HEAD_SIZES = [64, 80] @@ -130,7 +131,7 @@ def test_mha_attn_forward( k = torch.randn(batch_size, seq_len, num_kv_heads * head_size) v = torch.randn(batch_size, seq_len, num_kv_heads * head_size) scale = 1.0 / head_size**0.5 - attn = MultiHeadAttention( + attn = MMEncoderAttention( num_heads, head_size, scale=scale, num_kv_heads=num_kv_heads ) output = attn(q, k, v) @@ -151,3 +152,58 @@ def test_mha_attn_forward( scale=scale, ).reshape(batch_size, seq_len, num_heads * head_size) torch.testing.assert_close(output, ref_output) + + +@pytest.mark.parametrize("var_seq_len", VAR_SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("num_kv_heads", NUM_KV_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_mha_attn_varlen_forward( + var_seq_len: list[int], + num_heads: int, + num_kv_heads: int, + head_size: int, + dtype: torch.dtype, + device: str, +): + current_platform.seed_everything(0) + torch.set_default_device(device) + 
torch.set_default_dtype(dtype) + + q = torch.randn(1, sum(var_seq_len), num_heads, head_size) + k = torch.randn(1, sum(var_seq_len), num_kv_heads, head_size) + v = torch.randn(1, sum(var_seq_len), num_kv_heads, head_size) + cu_seqlens = torch.tensor( + [0] + list(itertools.accumulate(var_seq_len)), dtype=torch.int32 + ) + scale = 1.0 / head_size**0.5 + attn = MMEncoderAttention( + num_heads, head_size, scale=scale, num_kv_heads=num_kv_heads + ) + output = attn( + q, k, v, cu_seqlens=cu_seqlens, max_seqlen=torch.tensor(max(var_seq_len)) + ) + + assert num_heads % num_kv_heads == 0 + num_queries_per_kv = num_heads // num_kv_heads + if num_queries_per_kv > 1: + k = torch.repeat_interleave(k, num_queries_per_kv, dim=2) + v = torch.repeat_interleave(v, num_queries_per_kv, dim=2) + + ref_output = [] + for q_i, k_i, v_i in zip( + torch.split(q, var_seq_len, dim=1), + torch.split(k, var_seq_len, dim=1), + torch.split(v, var_seq_len, dim=1), + ): + output_i = ref_attention( + q_i, + k_i, + v_i, + scale=scale, + ) + ref_output.append(output_i) + ref_output = torch.cat(ref_output, dim=1) + torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2) diff --git a/tests/v1/tpu/test_mha_attn.py b/tests/v1/tpu/test_mha_attn.py index 5debdf85bea8d..84968dee6b60c 100644 --- a/tests/v1/tpu/test_mha_attn.py +++ b/tests/v1/tpu/test_mha_attn.py @@ -3,7 +3,7 @@ """ Test: -* Tests for MultiHeadAttention layer +* Tests for MMEncoderAttention layer """ import pytest @@ -12,7 +12,7 @@ import torch_xla import torch_xla.core import torch_xla.core.xla_model -from vllm.attention.layer import MultiHeadAttention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.attention.selector import _cached_get_attn_backend from vllm.platforms import current_platform @@ -69,7 +69,7 @@ def test_mha_attn_forward( k = torch.randn(batch_size, seq_len, num_kv_heads * head_size, device=device) v = torch.randn(batch_size, seq_len, num_kv_heads * head_size, device=device) 
scale = 1.0 / head_size**0.5 - attn = MultiHeadAttention( + attn = MMEncoderAttention( num_heads, head_size, scale=scale, num_kv_heads=num_kv_heads ) output = attn(q, k, v) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 7ef77db8fbb5b..1d882eb87bfde 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -2,12 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer.""" -import functools from typing import cast import torch import torch.nn as nn -import torch.nn.functional as F import vllm.envs as envs from vllm.attention.backends.abstract import ( @@ -16,13 +14,10 @@ from vllm.attention.backends.abstract import ( MLAAttentionImpl, ) from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.attention.layers.mm_encoder_attention import maybe_get_vit_flash_attn_backend from vllm.attention.selector import get_attn_backend -from vllm.attention.utils.fa_utils import get_flash_attn_version from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target from vllm.attention.utils.kv_transfer_utils import maybe_transfer_kv_layer from vllm.config import CacheConfig, get_current_vllm_config -from vllm.config.multimodal import MultiModalConfig from vllm.config.vllm import VllmConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger @@ -36,7 +31,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape -from vllm.model_executor.models.vision import get_vit_attn_backend from vllm.platforms import current_platform from vllm.utils.torch_utils import ( direct_register_custom_op, @@ -412,132 +406,6 @@ class Attention(nn.Module, AttentionLayerBase): ) -class 
MultiHeadAttention(nn.Module): - """Multi-headed attention without any cache, used for ViT.""" - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int | None = None, - # This has no effect, it is only here to make it easier to swap - # between Attention and MultiHeadAttention - prefix: str = "", - multimodal_config: MultiModalConfig | None = None, - ) -> None: - super().__init__() - self.num_heads = num_heads - self.head_size = head_size - self.scale = scale - self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads - self.layer_name = prefix - - assert self.num_heads % self.num_kv_heads == 0, ( - f"num_heads ({self.num_heads}) is not " - f"divisible by num_kv_heads ({self.num_kv_heads})" - ) - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - # During model initialization, the default dtype is set as the model - # weight and activation dtype. - dtype = torch.get_default_dtype() - - # Determine the attention backend - attn_backend_override = None - if multimodal_config is not None: - attn_backend_override = multimodal_config.mm_encoder_attn_backend - - self.attn_backend = get_vit_attn_backend( - head_size=head_size, - dtype=dtype, - attn_backend_override=attn_backend_override, - ) - - self._flash_attn_varlen_func = maybe_get_vit_flash_attn_backend( - self.attn_backend, - ) - - self.is_flash_attn_backend = self.attn_backend in { - AttentionBackendEnum.FLASH_ATTN, - AttentionBackendEnum.ROCM_AITER_FA, - } - - self.fa_version = None - if ( - self.attn_backend == AttentionBackendEnum.FLASH_ATTN - and current_platform.is_cuda() - ): - self.fa_version = get_flash_attn_version() - assert self._flash_attn_varlen_func is not None - self._flash_attn_varlen_func = functools.partial( - self._flash_attn_varlen_func, fa_version=self.fa_version - ) - - logger.info_once( - f"Using {self.attn_backend} for MultiHeadAttention in multimodal encoder." 
- ) - - def forward( - self, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - ) -> torch.Tensor: - """Input shape: - (batch_size x seq_len x hidden_size) or - (batch_size x seq_len x num_heads x head_size) - """ - bsz, q_len = query.size()[:2] - kv_len = key.size(1) - - query = query.view(bsz, q_len, self.num_heads, self.head_size) - key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size) - value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size) - - if (num_repeat := self.num_queries_per_kv) > 1: - # Handle MQA and GQA - key = torch.repeat_interleave(key, num_repeat, dim=2) - value = torch.repeat_interleave(value, num_repeat, dim=2) - - if self.is_flash_attn_backend: - assert self._flash_attn_varlen_func is not None - cu_seqlens_q = torch.arange( - 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=query.device - ) - cu_seqlens_k = torch.arange( - 0, (bsz + 1) * kv_len, step=kv_len, dtype=torch.int32, device=key.device - ) - - out = self._flash_attn_varlen_func( - query.flatten(0, 1), - key.flatten(0, 1), - value.flatten(0, 1), - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=q_len, - max_seqlen_k=kv_len, - softmax_scale=self.scale, - ) - elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA: - query, key, value = (x.transpose(1, 2) for x in (query, key, value)) - out = F.scaled_dot_product_attention(query, key, value, scale=self.scale) - out = out.transpose(1, 2) - elif self.attn_backend == AttentionBackendEnum.PALLAS: - query, key, value = (x.transpose(1, 2) for x in (query, key, value)) - from torch_xla.experimental.custom_kernel import flash_attention - - out = flash_attention(query, key, value, sm_scale=self.scale) - out = out.transpose(1, 2) - else: - # ViT attention hasn't supported this backend yet - raise NotImplementedError( - f"ViT attention hasn't supported {self.attn_backend} backend yet." 
- ) - - return out.reshape(bsz, q_len, -1) - - class MLAAttention(nn.Module, AttentionLayerBase): """Multi-Head Latent Attention layer. diff --git a/vllm/attention/layers/mm_encoder_attention.py b/vllm/attention/layers/mm_encoder_attention.py index 8b3dee1340b9f..25f54cc867b5a 100644 --- a/vllm/attention/layers/mm_encoder_attention.py +++ b/vllm/attention/layers/mm_encoder_attention.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable import torch @@ -19,27 +18,6 @@ from vllm.model_executor.models.vision import get_vit_attn_backend logger = init_logger(__name__) -def maybe_get_vit_flash_attn_backend( - attn_backend: AttentionBackendEnum | None, -) -> Callable | None: - # At this point, - # we already have the attn_backend, - # overriding logic is done in the platform-specific implementation. - # so we don't need to override backend here. - # Just return the attn_backend and flash_attn_varlen_func. - - if attn_backend == AttentionBackendEnum.FLASH_ATTN: - from vllm.attention.utils.fa_utils import flash_attn_varlen_func - elif attn_backend == AttentionBackendEnum.ROCM_AITER_FA: - from aiter import flash_attn_varlen_func - else: - flash_attn_varlen_func = None - - # if attn_backend is TORCH_SDPA, - # it will reach here and the flash_attn_varlen_func will be None. 
- return flash_attn_varlen_func - - @CustomOp.register("mm_encoder_attn") class MMEncoderAttention(CustomOp): """Multi-headed attention without any cache, used for multimodal encoder.""" @@ -98,21 +76,17 @@ class MMEncoderAttention(CustomOp): AttentionBackendEnum.ROCM_AITER_FA, } - self.flash_attn_varlen_func = maybe_get_vit_flash_attn_backend( - self.attn_backend, + self._fa_version = ( + get_flash_attn_version() if self.is_flash_attn_backend else None ) - if self.is_flash_attn_backend: - assert self.flash_attn_varlen_func is not None - self._fa_version = get_flash_attn_version() - logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.") @classmethod def enabled(cls) -> bool: return True - def reshape_qkv_to_4d( + def maybe_reshape_qkv_to_4d( self, query: torch.Tensor, key: torch.Tensor, @@ -136,30 +110,6 @@ class MMEncoderAttention(CustomOp): return query, key, value - def reshape_qkv_to_3d( - self, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - bsz: int, - q_len: int, - kv_len: int, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Reshape query, key, value to 3D tensors: - (batch_size * seq_len, num_heads, head_size) - """ - query = query.view(bsz * q_len, self.num_heads, self.head_size) - key = key.view(bsz * kv_len, self.num_kv_heads, self.head_size) - value = value.view(bsz * kv_len, self.num_kv_heads, self.head_size) - - if (num_repeat := self.num_queries_per_kv) > 1: - # Handle MQA and GQA - key = torch.repeat_interleave(key, num_repeat, dim=1) - value = torch.repeat_interleave(value, num_repeat, dim=1) - - return query, key, value - def _forward_sdpa( self, query: torch.Tensor, @@ -167,13 +117,15 @@ class MMEncoderAttention(CustomOp): value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, ) -> torch.Tensor: - # TODO(Isotr0py): Migrate MultiHeadAttention - assert cu_seqlens is not None - + """Input shape: + (batch_size x seq_len x hidden_size) or + (batch_size x seq_len x num_heads x head_size) + """ 
bsz, q_len = query.size()[:2] kv_len = key.size(1) + is_reshaped = query.dim() != 4 - query, key, value = self.reshape_qkv_to_4d( + query, key, value = self.maybe_reshape_qkv_to_4d( query, key, value, bsz, q_len, kv_len ) @@ -183,6 +135,8 @@ class MMEncoderAttention(CustomOp): v=value, cu_seqlens=cu_seqlens, ) + if is_reshaped: + output = output.view(bsz, q_len, -1) return output def _forward_fa( @@ -193,13 +147,21 @@ class MMEncoderAttention(CustomOp): cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention ) -> torch.Tensor: - assert self.flash_attn_varlen_func is not None, ( - "Flash attention function is not set." - ) - # # TODO(Isotr0py): Migrate MultiHeadAttention - assert cu_seqlens is not None and max_seqlen is not None + """Input shape: + (batch_size x seq_len x hidden_size) or + (batch_size x seq_len x num_heads x head_size) + """ + assert (cu_seqlens is not None and max_seqlen is not None) or ( + cu_seqlens is None and max_seqlen is None + ), "cu_seqlens and max_seqlen should be both set or both None." 
- bsz = query.shape[0] + bsz, q_len = query.size()[:2] + kv_len = key.size(1) + is_reshaped = query.dim() != 4 + + query, key, value = self.maybe_reshape_qkv_to_4d( + query, key, value, bsz, q_len, kv_len + ) output = vit_flash_attn_wrapper( q=query, @@ -211,6 +173,8 @@ class MMEncoderAttention(CustomOp): is_rocm_aiter=(self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA), fa_version=self._fa_version, ) + if is_reshaped: + output = output.view(bsz, q_len, -1) return output def forward_native( diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py index f555147bc055a..2204382a35e2a 100644 --- a/vllm/attention/ops/vit_attn_wrappers.py +++ b/vllm/attention/ops/vit_attn_wrappers.py @@ -24,11 +24,11 @@ def flash_attn_maxseqlen_wrapper( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - cu_seqlens: torch.Tensor, - max_seqlen: torch.Tensor, batch_size: int, is_rocm_aiter: bool, fa_version: int | None, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, ) -> torch.Tensor: kwargs = {} if is_rocm_aiter: @@ -38,6 +38,14 @@ def flash_attn_maxseqlen_wrapper( if not current_platform.is_rocm() and fa_version is not None: kwargs["fa_version"] = fa_version + + q_len = q.size(1) + if cu_seqlens is None: + cu_seqlens = torch.arange( + 0, (batch_size + 1) * q_len, step=q_len, dtype=torch.int32, device=q.device + ) + max_seqlen = q_len if max_seqlen is None else max_seqlen.item() + q, k, v = (einops.rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) output = flash_attn_varlen_func( q, @@ -45,8 +53,8 @@ def flash_attn_maxseqlen_wrapper( v, cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens, - max_seqlen_q=max_seqlen.item(), - max_seqlen_k=max_seqlen.item(), + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, dropout_p=0.0, causal=False, **kwargs, @@ -79,24 +87,42 @@ def vit_flash_attn_wrapper( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - cu_seqlens: torch.Tensor, - max_seqlen: torch.Tensor, batch_size: int, is_rocm_aiter: bool, fa_version: int | None, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, ) -> torch.Tensor: return torch.ops.vllm.flash_attn_maxseqlen_wrapper( - q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter, fa_version + q, + k, + v, + batch_size, + is_rocm_aiter, + fa_version, + cu_seqlens, + max_seqlen, ) +def apply_sdpa(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor: + """ + Input shape: + (batch_size x seq_len x num_heads x head_size) + """ + q, k, v = (einops.rearrange(x, "b s h d -> b h s d") for x in [q, k, v]) + output = F.scaled_dot_product_attention(q, k, v, dropout_p=0.0) + output = einops.rearrange(output, "b h s d -> b s h d ") + return output + + # TODO: Once we have a torch 2.10, we can use tensor slices # so we won't need to wrap this in custom ops def torch_sdpa_wrapper( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - cu_seqlens: torch.Tensor, + cu_seqlens: torch.Tensor | None = None, ) -> torch.Tensor: # Never remove the contiguous logic for ROCm # Without it, hallucinations occur with the backend @@ -105,6 +131,9 @@ def torch_sdpa_wrapper( k = k.contiguous() v = v.contiguous() + if cu_seqlens is None: + return apply_sdpa(q, k, v) + outputs = [] lens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() @@ -112,11 +141,7 @@ def torch_sdpa_wrapper( k_chunks = torch.split(k, lens, dim=1) v_chunks = torch.split(v, lens, dim=1) for q_i, k_i, v_i in zip(q_chunks, k_chunks, v_chunks): 
- q_i, k_i, v_i = ( - einops.rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i] - ) - output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0) - output_i = einops.rearrange(output_i, "b h s d -> b s h d ") + output_i = apply_sdpa(q_i, k_i, v_i) outputs.append(output_i) context_layer = torch.cat(outputs, dim=1) return context_layer @@ -142,6 +167,6 @@ def vit_torch_sdpa_wrapper( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - cu_seqlens: torch.Tensor, + cu_seqlens: torch.Tensor | None = None, ) -> torch.Tensor: return torch.ops.vllm.torch_sdpa_wrapper(q, k, v, cu_seqlens) diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py index 3d000f3ac3ab5..96ca27ad02504 100644 --- a/vllm/model_executor/models/aimv2.py +++ b/vllm/model_executor/models/aimv2.py @@ -8,7 +8,7 @@ from collections.abc import Iterable import torch import torch.nn as nn -from vllm.attention.layer import MultiHeadAttention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.utils import divide from vllm.model_executor.layers.activation import SiluAndMul @@ -126,7 +126,7 @@ class AIMv2Attention(nn.Module): self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) - self.attn = MultiHeadAttention( + self.attn = MMEncoderAttention( self.num_heads_per_partition, self.head_dim, self.scale ) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index f31f99c0592b2..7387830b32bdc 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -9,7 +9,7 @@ import torch import torch.nn as nn from transformers import Blip2VisionConfig, BlipVisionConfig -from vllm.attention.layer import MultiHeadAttention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.distributed import divide, 
get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.conv import Conv2dLayer @@ -122,7 +122,7 @@ class BlipAttention(nn.Module): self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) - self.attn = MultiHeadAttention( + self.attn = MMEncoderAttention( self.num_heads_per_partition, self.head_dim, self.scale ) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 22f3ecad748e6..8e77b36e6feb5 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -14,7 +14,8 @@ from transformers import ( CLIPVisionConfig, ) -from vllm.attention.layer import Attention, MultiHeadAttention +from vllm.attention.layer import Attention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size @@ -354,7 +355,7 @@ class CLIPAttention(nn.Module): quant_config: QuantizationConfig | None = None, *, prefix: str = "", - attn_cls: type[Attention] | type[MultiHeadAttention], + attn_cls: type[Attention] | type[MMEncoderAttention], ) -> None: super().__init__() @@ -449,7 +450,7 @@ class CLIPEncoderLayer(nn.Module): quant_config: QuantizationConfig | None = None, *, prefix: str = "", - attn_cls: type[Attention] | type[MultiHeadAttention], + attn_cls: type[Attention] | type[MMEncoderAttention], ) -> None: super().__init__() self.self_attn = CLIPAttention( @@ -493,7 +494,7 @@ class CLIPEncoder(nn.Module): num_hidden_layers_override: int | None = None, *, prefix: str = "", - attn_cls: type[Attention] | type[MultiHeadAttention], + attn_cls: type[Attention] | type[MMEncoderAttention], ) -> None: super().__init__() @@ -638,7 +639,7 @@ class CLIPVisionTransformer(nn.Module): quant_config=quant_config, 
num_hidden_layers_override=num_hidden_layers_override, prefix=f"{prefix}.encoder", - attn_cls=MultiHeadAttention, + attn_cls=MMEncoderAttention, ) num_hidden_layers = config.num_hidden_layers diff --git a/vllm/model_executor/models/deepencoder.py b/vllm/model_executor/models/deepencoder.py index 8f1660891fcbf..045445d23b8f3 100644 --- a/vllm/model_executor/models/deepencoder.py +++ b/vllm/model_executor/models/deepencoder.py @@ -18,7 +18,7 @@ import torch.nn as nn import torch.nn.functional as F from transformers import CLIPVisionConfig -from vllm.attention.layer import MultiHeadAttention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -628,7 +628,7 @@ class DeepCLIPVisionTransformer(nn.Module): quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, prefix=f"{prefix}.encoder", - attn_cls=MultiHeadAttention, + attn_cls=MMEncoderAttention, ) num_hidden_layers = config.num_hidden_layers diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index ec5af94e297c1..453a7812a1748 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -19,7 +19,7 @@ from transformers import BatchFeature, PreTrainedTokenizer, TensorType from transformers.image_utils import ImageInput from transformers.tokenization_utils_base import TextInput -from vllm.attention.layer import MultiHeadAttention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size @@ -135,7 +135,7 @@ class EVA2CLIPAttention(nn.Module): prefix=f"{prefix}.dense", ) - self.attn = MultiHeadAttention( + self.attn = 
MMEncoderAttention( self.num_heads_per_rank, self.head_dim, self.scale ) self.output_dropout = torch.nn.Dropout(config.dropout_prob) diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index be084f4ee0f8e..6fc56094af650 100644 --- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -34,7 +34,7 @@ import torch.nn.functional as F from transformers import BatchFeature from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.attention.layer import MultiHeadAttention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.config import MultiModalConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import parallel_state @@ -232,7 +232,7 @@ class HunYuanVisionAttention(nn.Module): ) self.scale = self.hidden_size_per_attention_head**-0.5 - self.attn = MultiHeadAttention( + self.attn = MMEncoderAttention( self.num_attention_heads_per_partition, self.hidden_size_per_attention_head, self.scale, diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 06b8468e18db9..ee6ca5eacb176 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -27,7 +27,7 @@ from transformers.models.idefics2.configuration_idefics2 import ( Idefics2VisionConfig, ) -from vllm.attention.layer import MultiHeadAttention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.conv import Conv2dLayer @@ -161,8 +161,8 @@ class Idefics2VisionAttention(nn.Module): prefix=f"{prefix}.out_proj", disable_tp=use_data_parallel, ) - # Use unified MultiHeadAttention with Flash Attention support - self.attn = 
MultiHeadAttention( + # Use unified MMEncoderAttention with Flash Attention support + self.attn = MMEncoderAttention( self.num_heads_per_partition, self.head_dim, self.scale ) @@ -175,7 +175,7 @@ class Idefics2VisionAttention(nn.Module): ) # batch_size, q_len, 3 * num_heads_per_partition * head_dim query_states, key_states, value_states = qkv.chunk(3, dim=-1) - # Use unified MultiHeadAttention implementation + # Use unified MMEncoderAttention implementation out = self.attn(query_states, key_states, value_states) attn_output, _ = self.out_proj(out) return attn_output diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 61aeafc2ab436..5f7ba838aa3d9 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -15,7 +15,7 @@ import torch.nn as nn import torch.nn.functional as F from transformers import PretrainedConfig -from vllm.attention.layer import MultiHeadAttention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.distributed import ( divide, get_tensor_model_parallel_rank, @@ -207,7 +207,7 @@ class InternParallelAttention(nn.Module): disable_tp=use_data_parallel, ) - self.attn = MultiHeadAttention( + self.attn = MMEncoderAttention( self.num_heads_per_partition, self.head_dim, self.scale ) diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py index cb0414bbc95a8..a16857d613226 100644 --- a/vllm/model_executor/models/interns1_vit.py +++ b/vllm/model_executor/models/interns1_vit.py @@ -14,7 +14,7 @@ import torch.nn as nn from transformers import PretrainedConfig from transformers.utils import torch_int -from vllm.attention.layer import MultiHeadAttention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import 
RMSNorm @@ -214,8 +214,8 @@ class InternSdpaAttention(nn.Module): self.projection_layer = nn.Linear(self.dummy_dim, self.embed_dim) - # Use unified MultiHeadAttention with automatic backend selection - self.attn = MultiHeadAttention(self.num_heads, self.head_dim, self.scale) + # Use unified MMEncoderAttention with automatic backend selection + self.attn = MMEncoderAttention(self.num_heads, self.head_dim, self.scale) def forward(self, x: torch.Tensor) -> torch.Tensor: """x shape: (B, N, C)""" @@ -228,7 +228,7 @@ class InternSdpaAttention(nn.Module): q = self.q_norm(q) k = self.k_norm(k) - # Use unified MultiHeadAttention with automatic backend selection + # Use unified MMEncoderAttention with automatic backend selection x = self.attn(q, k, v) x = self.projection_layer(x) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index fe963cc6644fb..886d5151e43ff 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -31,7 +31,7 @@ from transformers.models.llama4.image_processing_llama4_fast import ( get_best_fit, ) -from vllm.attention.layer import MultiHeadAttention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size @@ -255,7 +255,7 @@ class Llama4VisionAttention(nn.Module): self.attention_dropout = config.attention_dropout self.scaling = self.head_dim**-0.5 - self.attn = MultiHeadAttention( + self.attn = MMEncoderAttention( self.num_local_heads, self.head_dim, self.scaling ) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 71c6b1aa2e814..9c741e1f5071f 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -17,7 +17,8 @@ from transformers import BatchFeature, PretrainedConfig, ProcessorMixin, TensorT from transformers.image_utils 
import ImageInput from transformers.tokenization_utils_base import TextInput -from vllm.attention.layer import Attention, MultiHeadAttention +from vllm.attention.layer import Attention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions @@ -222,7 +223,7 @@ class MultiHeadDotProductAttention(nn.Module): ) self.scale = self.head_dim**-0.5 - self.attn = MultiHeadAttention( + self.attn = MMEncoderAttention( self.num_heads, self.head_dim, self.scale, num_kv_heads=self.num_kv_heads ) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 2600dc1c9f79c..799afc7ca2e51 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -16,8 +16,8 @@ from transformers import ( SiglipVisionConfig, ) -from vllm.attention.layer import MultiHeadAttention from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size @@ -379,7 +379,7 @@ class SiglipAttention(nn.Module): quant_config: QuantizationConfig | None = None, *, prefix: str = "", - attn_cls: type[EncoderOnlyAttention] | type[MultiHeadAttention], + attn_cls: type[EncoderOnlyAttention] | type[MMEncoderAttention], ) -> None: super().__init__() @@ -481,7 +481,7 @@ class SiglipEncoderLayer(nn.Module): quant_config: QuantizationConfig | None = None, *, prefix: str = "", - attn_cls: type[EncoderOnlyAttention] | type[MultiHeadAttention], + attn_cls: type[EncoderOnlyAttention] | type[MMEncoderAttention], ) -> None: super().__init__() @@ -527,7 +527,7 @@ class SiglipEncoder(nn.Module): num_hidden_layers_override: int | None = None, *, 
prefix: str = "", - attn_cls: type[EncoderOnlyAttention] | type[MultiHeadAttention], + attn_cls: type[EncoderOnlyAttention] | type[MMEncoderAttention], ) -> None: super().__init__() @@ -700,7 +700,7 @@ class SiglipVisionTransformer(nn.Module): quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, prefix=f"{prefix}.encoder", - attn_cls=MultiHeadAttention, + attn_cls=MMEncoderAttention, ) num_hidden_layers = config.num_hidden_layers diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index e5038e56a2708..3c965721b9dae 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -15,7 +15,7 @@ from torchvision import transforms from torchvision.transforms.functional import InterpolationMode from transformers import BatchFeature, PretrainedConfig, TensorType -from vllm.attention.layer import MultiHeadAttention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size @@ -753,8 +753,8 @@ class Step3VisionAttention(nn.Module): disable_tp=use_data_parallel, ) - # Use unified MultiHeadAttention with automatic backend selection - self.attn = MultiHeadAttention(self.num_heads, self.head_dim, self.scale) + # Use unified MMEncoderAttention with automatic backend selection + self.attn = MMEncoderAttention(self.num_heads, self.head_dim, self.scale) def forward( self, @@ -767,7 +767,7 @@ class Step3VisionAttention(nn.Module): qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - # Use unified MultiHeadAttention with automatic backend selection + # Use unified MMEncoderAttention with automatic backend selection attn_output = self.attn(q, k, v) attn_output, _ = self.out_proj(attn_output) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 
b513e3513b2e2..f5a1e75d99617 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -16,9 +16,9 @@ from transformers import ( ) from transformers.models.whisper.modeling_whisper import sinusoids -from vllm.attention.backends.abstract import AttentionType -from vllm.attention.layer import Attention, MultiHeadAttention +from vllm.attention.layer import Attention, AttentionType from vllm.attention.layers.cross_attention import CrossAttention +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size @@ -141,7 +141,7 @@ class WhisperAudioInputs(TensorSchema): ] -class WhisperEncoderAttention(MultiHeadAttention): +class WhisperEncoderAttention(MMEncoderAttention): """Multi-headed attention for Whisper encoder with 2D tensor support.""" def forward( From 058926d48c2435496839ec8e7e3ee90683ea7791 Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Fri, 19 Dec 2025 02:16:36 +0800 Subject: [PATCH 073/176] [XPU] allow custom workers (e.g. 
vllm-omni workers) to be used on XPU (#30935) Signed-off-by: Fanli Lin --- vllm/platforms/xpu.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index af8979af36643..2d67551eed9f6 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -159,7 +159,10 @@ class XPUPlatform(Platform): # check and update parallel config parallel_config = vllm_config.parallel_config - parallel_config.worker_cls = "vllm.v1.worker.xpu_worker.XPUWorker" + # Only override worker_cls if it's still the default "auto" + # This allows custom workers (like vllm-omni workers) to be used on XPU + if parallel_config.worker_cls == "auto": + parallel_config.worker_cls = "vllm.v1.worker.xpu_worker.XPUWorker" if vllm_config.kv_transfer_config is not None: vllm_config.kv_transfer_config.enable_permute_local_kv = True From 889f8bb250d498dcd38d7bcf58fb3c9e50d54d14 Mon Sep 17 00:00:00 2001 From: wz1qqx <55830058+wz1qqx@users.noreply.github.com> Date: Fri, 19 Dec 2025 03:09:51 +0800 Subject: [PATCH 074/176] [BugFix]Reclaim resources to prevent memory leaks when use LMCacheMPConnector (#30745) Signed-off-by: wz1qqx Co-authored-by: wz1qqx --- .../lmcache_integration/multi_process_adapter.py | 8 ++++++++ .../kv_connector/v1/lmcache_mp_connector.py | 16 ++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py index 6656b5a25f83d..9db4dedf48b7b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py @@ -147,6 +147,14 @@ class LMCacheMPSchedulerAdapter: """ return self.blocks_in_chunk + def _cleanup_lookup_result(self, request_id: str) -> None: + """ + Clean up lookup future for a finished request to prevent 
memory leak. + Args: + request_id: The ID of the finished request. + """ + self.lookup_futures.pop(request_id, None) + # Helper functions def _create_key(self, block_hash: bytes) -> IPCCacheEngineKey: """Convert a block hash to an IPC cache engine key""" diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py index 995708b89bc26..29166be62c242 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py @@ -701,6 +701,8 @@ class LMCacheMPConnector(KVConnectorBase_V1): if condition else LMCacheMPRequestState.READY ) + # Clean up lookup future in scheduler adapter + self.scheduler_adapter._cleanup_lookup_result(request.request_id) def build_connector_meta( self, scheduler_output: SchedulerOutput @@ -754,6 +756,8 @@ class LMCacheMPConnector(KVConnectorBase_V1): Optional KVTransferParams to be included in the request outputs returned by the engine. """ + # Clean up request tracker to prevent memory leak + self._cleanup_request_tracker(request.request_id) return True, None def take_events(self) -> Iterable["KVCacheEvent"]: @@ -915,3 +919,15 @@ class LMCacheMPConnector(KVConnectorBase_V1): new_tracker = LMCacheMPRequestTracker(request) self.request_trackers[request_id] = new_tracker return self.request_trackers[request_id] + + def _cleanup_request_tracker(self, request_id: str) -> None: + """ + Clean up request tracker and associated lookup future for a request. + This should be called when a request is finished to prevent memory leak. 
+ """ + # Clean up request tracker + if self.request_trackers.pop(request_id, None): + logger.debug( + "[KVConnector] Cleaned up request_tracker for request %s", + request_id, + ) From 53ad423f2638a3cbb95149928f127e07564581b7 Mon Sep 17 00:00:00 2001 From: jiahanc <173873397+jiahanc@users.noreply.github.com> Date: Thu, 18 Dec 2025 11:31:18 -0800 Subject: [PATCH 075/176] [Perf] enable flashinfer rotary_embedding custom ops in DeepSeek rotary (#30729) Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com> --- .../layers/rotary_embedding/base.py | 5 ++++- .../rotary_embedding/deepseek_scaling_rope.py | 21 ++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index afa69324c4e2e..7e83ea9a1355b 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -38,7 +38,10 @@ class RotaryEmbeddingBase(CustomOp): # and current_platform.is_cuda() # and has_flashinfer() # and self.head_size in [64, 128, 256, 512]) - self.use_flashinfer = False + + # Check if use_flashinfer is already set + if not hasattr(self, "use_flashinfer"): + self.use_flashinfer = False cache = self._compute_cos_sin_cache() if not self.use_flashinfer: diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py index e72834e473c15..8402b65efcc04 100644 --- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py @@ -6,6 +6,7 @@ import math import torch from vllm.platforms import current_platform +from vllm.utils.flashinfer import has_flashinfer from .base import RotaryEmbeddingBase from .common import ( @@ -56,6 +57,13 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbeddingBase): / yarn_get_mscale(self.scaling_factor, 
float(mscale_all_dim)) * attn_factor ) + self.use_flashinfer = ( + self.enabled() + and dtype in (torch.float16, torch.bfloat16) + and current_platform.is_cuda() + and has_flashinfer() + and head_size in [64, 128, 256, 512] + ) super().__init__( head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype ) @@ -162,4 +170,15 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbeddingBase): key: torch.Tensor | None = None, offsets: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor | None]: - return self.forward_native(positions, query, key, offsets) + if self.use_flashinfer: + torch.ops.vllm.flashinfer_rotary_embedding( + torch.add(positions, offsets) if offsets is not None else positions, + query, + key, + self.head_size, + self.cos_sin_cache, + self.is_neox_style, + ) + return query, key + else: + return self.forward_native(positions, query, key, offsets) From b8c477c11502ad9b52e833faff3e48ba25752e04 Mon Sep 17 00:00:00 2001 From: navmarri14 Date: Thu, 18 Dec 2025 11:41:59 -0800 Subject: [PATCH 076/176] tuned fused configs for B300 (#30629) --- ...me=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json | 147 ++++++++++++++++++ ...me=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json | 147 ++++++++++++++++++ ...me=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json | 147 ++++++++++++++++++ 3 files changed, 441 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=160,N=768,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json new file mode 100644 index 
0000000000000..291a760cb2382 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.1", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..a081be65f613b --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.1", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + 
"num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=768,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=768,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json new file mode 100644 index 
0000000000000..49aadc8c9dfd3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=768,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.1", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + } +} From d2dc5dfc6ecafbd3d725c1c42dd019db2b1efd30 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 19 Dec 2025 03:42:32 +0800 Subject: [PATCH 077/176] [Bugfix] Remove `tile_size=64` for mm_prefix triton attention (#30973) Signed-off-by: Isotr0py --- vllm/attention/ops/triton_unified_attention.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index ae5a48ec3d26d..f61c8e9b89c24 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -800,7 +800,6 @@ def _get_tile_size( head_size: int, sliding_window: int, element_size: int, - is_mm_prefix: bool, is_prefill: bool, ) -> int: """Select tile size with Gemma3-specific optimization. @@ -809,10 +808,6 @@ def _get_tile_size( the larger head dimension (128/256). For other models, use the default vLLM behavior. 
""" - if is_mm_prefix: - # Multimodal bidirectional attention needs a larger tile size - return 64 - if _is_gemma3_attention(head_size, sliding_window): # Gemma3: use 32 for decode (default is 16) return 32 @@ -903,14 +898,12 @@ def unified_attention( head_size, sliding_window_val, q.element_size(), - is_mm_prefix=use_mm_prefix, is_prefill=True, ) TILE_SIZE_DECODE = _get_tile_size( head_size, sliding_window_val, q.element_size(), - is_mm_prefix=use_mm_prefix, is_prefill=False, ) From 97000a2be7e318be1a3eb172f9abf2d67dbe73bf Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 18 Dec 2025 14:45:55 -0500 Subject: [PATCH 078/176] [Bug] Fix compressed tensor not using deepgemm (#30820) Signed-off-by: yewentao256 --- vllm/model_executor/layers/fused_moe/fused_moe.py | 1 - .../compressed_tensors/compressed_tensors_moe.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 20782e2712f27..37f8e7780f999 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1696,7 +1696,6 @@ def fused_experts( and (is_deep_gemm_e8m0_used() or _valid_deep_gemm(hidden_states, w1, w2)) ): assert quant_config is not None - assert apply_router_weight_on_input is False return deep_gemm_moe_fp8( hidden_states=hidden_states, w1=w1, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index c302e465aedb7..fc359a3067a9c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -96,6 +96,7 @@ from vllm.utils.deep_gemm import ( get_col_major_tma_aligned_tensor, 
get_mk_alignment_for_contiguous_layout, is_deep_gemm_e8m0_used, + is_deep_gemm_supported, ) from vllm.utils.import_utils import has_deep_gemm @@ -716,6 +717,13 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): get_marlin_input_dtype(layer_name) if self.use_marlin else None ) + self.allow_deep_gemm = ( + self.block_quant + and envs.VLLM_MOE_USE_DEEP_GEMM + and is_deep_gemm_supported() + and list(self.weight_block_size) == get_mk_alignment_for_contiguous_layout() + ) + def create_weights( self, layer: torch.nn.Module, @@ -1231,6 +1239,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): if self.disable_expert_map else layer.expert_map, # ??? quant_config=self.moe_quant_config, + allow_deep_gemm=self.allow_deep_gemm, ) else: from vllm.model_executor.layers.fused_moe.cutlass_moe import ( @@ -1272,6 +1281,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, quant_config=self.moe_quant_config, + allow_deep_gemm=self.allow_deep_gemm, ) @property From 41b6f9200fef27cd43b3299408a6f0e50654931f Mon Sep 17 00:00:00 2001 From: Elizabeth Thomas Date: Thu, 18 Dec 2025 13:46:28 -0600 Subject: [PATCH 079/176] Remove all2all backend envvar (#30363) Signed-off-by: Elizabeth Thomas Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../qwen30b_a3b_fp8_block_ep_eplb.sh | 2 +- .../qwen3_next_mtp_async_eplb.sh | 2 +- .buildkite/test-amd.yaml | 2 +- .buildkite/test-pipeline.yaml | 2 +- .buildkite/test_areas/distributed.yaml | 2 +- docs/design/moe_kernel_features.md | 2 +- .../elastic_ep/serve_deepseek_v2.sh | 2 +- tests/v1/cudagraph/test_cudagraph_dispatch.py | 5 +- vllm/config/compilation.py | 8 ++- vllm/config/parallel.py | 49 +++++++++---------- vllm/engine/arg_utils.py | 2 +- vllm/envs.py | 5 +- 12 files changed, 40 insertions(+), 43 deletions(-) diff 
--git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh index 6a1bef275d047..d0921c5699d5d 100644 --- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh @@ -44,10 +44,10 @@ trap cleanup EXIT for BACK in "${BACKENDS[@]}"; do VLLM_DEEP_GEMM_WARMUP=skip \ - VLLM_ALL2ALL_BACKEND=$BACK \ vllm serve "$MODEL" \ --enforce-eager \ --enable-eplb \ + --all2all-backend $BACK \ --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \ --data-parallel-size ${DATA_PARALLEL_SIZE} \ diff --git a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh index 937a43d1a3221..b3b65128e6062 100644 --- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh @@ -43,12 +43,12 @@ trap cleanup EXIT for BACK in "${BACKENDS[@]}"; do VLLM_DEEP_GEMM_WARMUP=skip \ - VLLM_ALL2ALL_BACKEND=$BACK \ vllm serve "$MODEL" \ --enforce-eager \ --tensor-parallel-size 4 \ --enable-expert-parallel \ --enable-eplb \ + --all2all-backend $BACK \ --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \ --trust-remote-code \ diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index f294261ec8c3a..6e20ff3bf38d9 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1497,7 +1497,7 @@ steps: - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s 
tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8e3bcfe4a36bc..faf34d95735f4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1331,7 +1331,7 @@ steps: - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 2cc90698d916a..52d57c99fcfb5 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -145,7 +145,7 @@ steps: - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s 
tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py - label: Distributed Tests (2 GPUs)(B200) diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index 48341d199cb80..6c02dcb76bec2 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -16,7 +16,7 @@ Async backends support the use of DBO (Dual Batch Overlap) and shared expert ove Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. Llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass. For non-modular kernels, it is up to the experts function to deal with this flag. -Unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP without EP. +Unless otherwise specified, backends are controlled via the `--all2all-backend` command-line argument (or the `all2all_backend` parameter in `ParallelConfig`). All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP without EP.